vitess.io/vitess@v0.16.2/go/vt/vttablet/grpctmclient/cached_client.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package grpctmclient
    18  
    19  import (
    20  	"context"
    21  	"io"
    22  	"sort"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/spf13/pflag"
    27  	"google.golang.org/grpc"
    28  
    29  	"vitess.io/vitess/go/netutil"
    30  	"vitess.io/vitess/go/stats"
    31  	"vitess.io/vitess/go/sync2"
    32  	"vitess.io/vitess/go/vt/grpcclient"
    33  	"vitess.io/vitess/go/vt/servenv"
    34  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    35  
    36  	tabletmanagerservicepb "vitess.io/vitess/go/vt/proto/tabletmanagerservice"
    37  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    38  )
    39  
// defaultPoolCapacity is the capacity passed to NewCachedConnClient by the
// "grpc-cached" client factory. It starts at 100 and is the variable bound to
// the tablet_manager_grpc_connpool_size flag.
var defaultPoolCapacity = 100
    41  
    42  func registerCachedClientFlags(fs *pflag.FlagSet) {
    43  	fs.IntVar(&defaultPoolCapacity, "tablet_manager_grpc_connpool_size", defaultPoolCapacity, "number of tablets to keep tmclient connections open to")
    44  }
    45  
    46  func init() {
    47  	tmclient.RegisterTabletManagerClientFactory("grpc-cached", func() tmclient.TabletManagerClient {
    48  		return NewCachedConnClient(defaultPoolCapacity)
    49  	})
    50  
    51  	for _, cmd := range _binaries {
    52  		servenv.OnParseFor(cmd, registerCachedClientFlags)
    53  	}
    54  }
    55  
    56  // closeFunc allows a standalone function to implement io.Closer, similar to
    57  // how http.HandlerFunc allows standalone functions to implement http.Handler.
    58  type closeFunc func() error
    59  
    60  func (fn closeFunc) Close() error {
    61  	return fn()
    62  }
    63  
    64  var _ io.Closer = (*closeFunc)(nil)
    65  
// cachedConn is one entry in the dialer's connection cache.
//
// It embeds the TabletManagerClient stub built from cc, so a *cachedConn can
// be handed to callers directly as the client. The remaining fields are
// bookkeeping for the cache:
//   - cc is the underlying gRPC connection; it is closed when the entry is
//     evicted or when the dialer is closed.
//   - addr is the key the entry is stored under in cachedConnDialer.conns.
//   - lastAccessTime is set when the entry is created and each time it is
//     lent out again.
//   - refs counts the outstanding loans: it starts at 1, is incremented on
//     every reuse, and is decremented by the io.Closer returned to the caller.
type cachedConn struct {
	tabletmanagerservicepb.TabletManagerClient
	cc *grpc.ClientConn

	addr           string
	lastAccessTime time.Time
	refs           int
}
    74  
// cachedConnDialer is a dialer that keeps gRPC connections to tablets open
// and lends them out to callers instead of dialing on every request.
//
//   - m is the mutex held around accesses to conns, evict and evictSorted, as
//     well as the refs/lastAccessTime fields of the cached connections.
//   - conns maps a tablet address to its cached connection.
//   - evict is the eviction queue; once sorted, the best eviction candidate
//     is at index 0.
//   - evictSorted records whether evict is currently in sorted order.
//   - connWaitSema is created with capacity slots; a slot is taken before a
//     new connection is dialed and released when a dial fails or when the
//     dialer is closed.
//   - capacity is the size the dialer was created with; it is reused to size
//     the fresh eviction queue in Close.
type cachedConnDialer struct {
	m            sync.Mutex
	conns        map[string]*cachedConn
	evict        []*cachedConn
	evictSorted  bool
	connWaitSema *sync2.Semaphore
	capacity     int
}
    83  
// dialerStats groups the stats variables published by the cached dialer:
//   - ConnReuse is bumped each time a cached connection is lent out again.
//   - ConnNew is bumped each time a new connection is added to the cache.
//   - DialTimeouts is bumped when the context ends while dial() is polling
//     for a free slot.
//   - DialTimings records how long dial() took, labelled by the path taken
//     ("cache_fast", "sema_fast" or "sema_poll").
var dialerStats = struct {
	ConnReuse    *stats.Gauge
	ConnNew      *stats.Gauge
	DialTimeouts *stats.Gauge
	DialTimings  *stats.Timings
}{
	ConnReuse:    stats.NewGauge("tabletmanagerclient_cachedconn_reuse", "number of times a call to dial() was able to reuse an existing connection"),
	ConnNew:      stats.NewGauge("tabletmanagerclient_cachedconn_new", "number of times a call to dial() resulted in a dialing a new grpc clientconn"),
	DialTimeouts: stats.NewGauge("tabletmanagerclient_cachedconn_dial_timeouts", "number of context timeouts during dial()"),
	DialTimings:  stats.NewTimings("tabletmanagerclient_cachedconn_dial_timings", "timings for various dial paths", "path", "cache_fast", "sema_fast", "sema_poll"),
}
    95  
    96  // NewCachedConnClient returns a grpc Client that caches connections to the
    97  // different tablets.
    98  func NewCachedConnClient(capacity int) *Client {
    99  	dialer := &cachedConnDialer{
   100  		conns:        make(map[string]*cachedConn, capacity),
   101  		evict:        make([]*cachedConn, 0, capacity),
   102  		connWaitSema: sync2.NewSemaphore(capacity, 0),
   103  		capacity:     capacity,
   104  	}
   105  	return &Client{dialer}
   106  }
   107  
// Compile-time check that *cachedConnDialer satisfies the dialer interface.
var _ dialer = (*cachedConnDialer)(nil)
   109  
   110  func (dialer *cachedConnDialer) sortEvictionsLocked() {
   111  	if !dialer.evictSorted {
   112  		sort.Slice(dialer.evict, func(i, j int) bool {
   113  			left, right := dialer.evict[i], dialer.evict[j]
   114  			if left.refs == right.refs {
   115  				return right.lastAccessTime.After(left.lastAccessTime)
   116  			}
   117  			return right.refs > left.refs
   118  		})
   119  		dialer.evictSorted = true
   120  	}
   121  }
   122  
   123  func (dialer *cachedConnDialer) dial(ctx context.Context, tablet *topodatapb.Tablet) (tabletmanagerservicepb.TabletManagerClient, io.Closer, error) {
   124  	start := time.Now()
   125  	addr := getTabletAddr(tablet)
   126  
   127  	if client, closer, found, err := dialer.tryFromCache(addr, &dialer.m); found {
   128  		dialerStats.DialTimings.Add("cache_fast", time.Since(start))
   129  		return client, closer, err
   130  	}
   131  
   132  	if dialer.connWaitSema.TryAcquire() {
   133  		defer func() {
   134  			dialerStats.DialTimings.Add("sema_fast", time.Since(start))
   135  		}()
   136  
   137  		// Check if another goroutine managed to dial a conn for the same addr
   138  		// while we were waiting for the write lock. This is identical to the
   139  		// read-lock section above, except we release the connWaitSema if we
   140  		// are able to use the cache, allowing another goroutine to dial a new
   141  		// conn instead.
   142  		if client, closer, found, err := dialer.tryFromCache(addr, &dialer.m); found {
   143  			dialer.connWaitSema.Release()
   144  			return client, closer, err
   145  		}
   146  		return dialer.newdial(ctx, addr)
   147  	}
   148  
   149  	defer func() {
   150  		dialerStats.DialTimings.Add("sema_poll", time.Since(start))
   151  	}()
   152  
   153  	for {
   154  		select {
   155  		case <-ctx.Done():
   156  			dialerStats.DialTimeouts.Add(1)
   157  			return nil, nil, ctx.Err()
   158  		default:
   159  			if client, closer, found, err := dialer.pollOnce(ctx, addr); found {
   160  				return client, closer, err
   161  			}
   162  		}
   163  	}
   164  }
   165  
   166  // tryFromCache tries to get a connection from the cache, performing a redial
   167  // on that connection if it exists. It returns a TabletManagerClient impl, an
   168  // io.Closer, a flag to indicate whether a connection was found in the cache,
   169  // and an error, which is always nil.
   170  //
   171  // In addition to the addr being dialed, tryFromCache takes a sync.Locker which,
   172  // if not nil, will be used to wrap the lookup and redial in that lock. This
   173  // function can be called in situations where the conns map is locked
   174  // externally (like in pollOnce), so we do not want to manage the locks here. In
   175  // other cases (like in the cache_fast path of dial()), we pass in the dialer.m
   176  // to ensure we have a lock on the cache for the duration of the call.
   177  func (dialer *cachedConnDialer) tryFromCache(addr string, locker sync.Locker) (client tabletmanagerservicepb.TabletManagerClient, closer io.Closer, found bool, err error) {
   178  	if locker != nil {
   179  		locker.Lock()
   180  		defer locker.Unlock()
   181  	}
   182  
   183  	if conn, ok := dialer.conns[addr]; ok {
   184  		client, closer, err := dialer.redialLocked(conn)
   185  		return client, closer, ok, err
   186  	}
   187  
   188  	return nil, nil, false, nil
   189  }
   190  
   191  // pollOnce is called on each iteration of the polling loop in dial(). It:
   192  //   - locks the conns cache for writes
   193  //   - attempts to get a connection from the cache. If found, redial() it and exit.
   194  //   - peeks at the head of the eviction queue. if the peeked conn has no refs, it
   195  //     is unused, and can be evicted to make room for the new connection to addr.
   196  //     If the peeked conn has refs, exit.
   197  //   - pops the conn we just peeked from the queue, deletes it from the cache, and
   198  //     close the underlying ClientConn for that conn.
   199  //   - attempt a newdial. if the newdial fails, it will release a slot on the
   200  //     connWaitSema, so another dial() call can successfully acquire it to dial
   201  //     a new conn. if the newdial succeeds, we will have evicted one conn, but
   202  //     added another, so the net change is 0, and no changes to the connWaitSema
   203  //     are made.
   204  //
   205  // It returns a TabletManagerClient impl, an io.Closer, a flag to indicate
   206  // whether the dial() poll loop should exit, and an error.
   207  func (dialer *cachedConnDialer) pollOnce(ctx context.Context, addr string) (client tabletmanagerservicepb.TabletManagerClient, closer io.Closer, found bool, err error) {
   208  	dialer.m.Lock()
   209  
   210  	if client, closer, found, err := dialer.tryFromCache(addr, nil); found {
   211  		dialer.m.Unlock()
   212  		return client, closer, found, err
   213  	}
   214  
   215  	dialer.sortEvictionsLocked()
   216  
   217  	conn := dialer.evict[0]
   218  	if conn.refs != 0 {
   219  		dialer.m.Unlock()
   220  		return nil, nil, false, nil
   221  	}
   222  
   223  	dialer.evict = dialer.evict[1:]
   224  	delete(dialer.conns, conn.addr)
   225  	conn.cc.Close()
   226  	dialer.m.Unlock()
   227  
   228  	client, closer, err = dialer.newdial(ctx, addr)
   229  	return client, closer, true, err
   230  }
   231  
   232  // newdial creates a new cached connection, and updates the cache and eviction
   233  // queue accordingly. If newdial fails to create the underlying
   234  // gRPC connection, it will make a call to Release the connWaitSema for other
   235  // newdial calls.
   236  //
   237  // It returns the three-tuple of client-interface, closer, and error that the
   238  // main dial func returns.
   239  func (dialer *cachedConnDialer) newdial(ctx context.Context, addr string) (tabletmanagerservicepb.TabletManagerClient, io.Closer, error) {
   240  	opt, err := grpcclient.SecureDialOption(cert, key, ca, crl, name)
   241  	if err != nil {
   242  		dialer.connWaitSema.Release()
   243  		return nil, nil, err
   244  	}
   245  
   246  	cc, err := grpcclient.DialContext(ctx, addr, grpcclient.FailFast(false), opt)
   247  	if err != nil {
   248  		dialer.connWaitSema.Release()
   249  		return nil, nil, err
   250  	}
   251  
   252  	dialer.m.Lock()
   253  	defer dialer.m.Unlock()
   254  
   255  	if conn, existing := dialer.conns[addr]; existing {
   256  		// race condition: some other goroutine has dialed our tablet before we have;
   257  		// this is not great, but shouldn't happen often (if at all), so we're going to
   258  		// close this connection and reuse the existing one. by doing this, we can keep
   259  		// the actual Dial out of the global lock and significantly increase throughput
   260  		cc.Close()
   261  		dialer.connWaitSema.Release()
   262  		return dialer.redialLocked(conn)
   263  	}
   264  
   265  	dialerStats.ConnNew.Add(1)
   266  
   267  	conn := &cachedConn{
   268  		TabletManagerClient: tabletmanagerservicepb.NewTabletManagerClient(cc),
   269  		cc:                  cc,
   270  		lastAccessTime:      time.Now(),
   271  		refs:                1,
   272  		addr:                addr,
   273  	}
   274  
   275  	// NOTE: we deliberately do not set dialer.evictSorted=false here. Since
   276  	// cachedConns are evicted from the front of the queue, and we are appending
   277  	// to the end, if there is already a second evictable connection, it will be
   278  	// at the front of the queue, so we can speed up the edge case where we need
   279  	// to evict multiple connections in a row.
   280  	dialer.evict = append(dialer.evict, conn)
   281  	dialer.conns[addr] = conn
   282  
   283  	return dialer.connWithCloser(conn)
   284  }
   285  
   286  // redialLocked takes an already-dialed connection in the cache does all the
   287  // work of lending that connection out to one more caller. It returns the
   288  // three-tuple of client-interface, closer, and error that the main dial func
   289  // returns.
   290  func (dialer *cachedConnDialer) redialLocked(conn *cachedConn) (tabletmanagerservicepb.TabletManagerClient, io.Closer, error) {
   291  	dialerStats.ConnReuse.Add(1)
   292  	conn.lastAccessTime = time.Now()
   293  	conn.refs++
   294  	dialer.evictSorted = false
   295  	return dialer.connWithCloser(conn)
   296  }
   297  
   298  // connWithCloser returns the three-tuple expected by the main dial func, where
   299  // the closer handles the correct state management for updating the conns place
   300  // in the eviction queue.
   301  func (dialer *cachedConnDialer) connWithCloser(conn *cachedConn) (tabletmanagerservicepb.TabletManagerClient, io.Closer, error) {
   302  	return conn, closeFunc(func() error {
   303  		dialer.m.Lock()
   304  		defer dialer.m.Unlock()
   305  		conn.refs--
   306  		dialer.evictSorted = false
   307  		return nil
   308  	}), nil
   309  }
   310  
   311  // Close closes all currently cached connections, ***regardless of whether
   312  // those connections are in use***. Calling Close therefore will fail any RPCs
   313  // using currently lent-out connections, and, furthermore, will invalidate the
   314  // io.Closer that was returned for that connection from dialer.dial(). When
   315  // calling those io.Closers, they will still lock the dialer's mutex, and then
   316  // perform needless operations that will slow down dial throughput, but not
   317  // actually impact the correctness of the internal state of the dialer.
   318  //
   319  // As a result, while it is safe to reuse a cachedConnDialer after calling Close,
   320  // it will be less performant than getting a new one, either by calling
   321  // tmclient.TabletManagerClient() with
   322  // TabletManagerProtocol set to "grpc-cached", or by calling
   323  // grpctmclient.NewCachedConnClient directly.
   324  func (dialer *cachedConnDialer) Close() {
   325  	dialer.m.Lock()
   326  	defer dialer.m.Unlock()
   327  
   328  	for _, conn := range dialer.evict {
   329  		conn.cc.Close()
   330  		delete(dialer.conns, conn.addr)
   331  		dialer.connWaitSema.Release()
   332  	}
   333  	dialer.evict = make([]*cachedConn, 0, dialer.capacity)
   334  }
   335  
   336  func getTabletAddr(tablet *topodatapb.Tablet) string {
   337  	return netutil.JoinHostPort(tablet.Hostname, int32(tablet.PortMap["grpc"]))
   338  }