github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/grpc/balancer/grpclb/grpclb_remote_balancer.go

/*
 *
 * Copyright 2017 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package grpclb

import (
	"context"
	"fmt"
	"io"
	"net"
	"sync"
	"time"

	"github.com/golang/protobuf/proto"
	timestamppb "github.com/golang/protobuf/ptypes/timestamp"
	"github.com/google/go-cmp/cmp"
	grpc "github.com/hxx258456/ccgo/grpc"
	"github.com/hxx258456/ccgo/grpc/balancer"
	lbpb "github.com/hxx258456/ccgo/grpc/balancer/grpclb/grpc_lb_v1"
	"github.com/hxx258456/ccgo/grpc/connectivity"
	"github.com/hxx258456/ccgo/grpc/internal/backoff"
	"github.com/hxx258456/ccgo/grpc/internal/channelz"
	imetadata "github.com/hxx258456/ccgo/grpc/internal/metadata"
	"github.com/hxx258456/ccgo/grpc/keepalive"
	"github.com/hxx258456/ccgo/grpc/metadata"
	"github.com/hxx258456/ccgo/grpc/resolver"
)

// processServerList updates the balancer's internal state, creates/removes
// SubConns, and regenerates the picker using the received serverList.
func (lb *lbBalancer) processServerList(l *lbpb.ServerList) {
	if logger.V(2) {
		logger.Infof("lbBalancer: processing server list: %+v", l)
	}
	lb.mu.Lock()
	defer lb.mu.Unlock()

	// Set serverListReceived to true so that fallback will not take effect
	// if the fallback timeout has not fired yet.
	lb.serverListReceived = true

	// If the new server list == old server list, do nothing.
	if cmp.Equal(lb.fullServerList, l.Servers, cmp.Comparer(proto.Equal)) {
		if logger.V(2) {
			logger.Infof("lbBalancer: new serverlist same as the previous one, ignoring")
		}
		return
	}
	lb.fullServerList = l.Servers

	var backendAddrs []resolver.Address
	for i, s := range l.Servers {
		if s.Drop {
			continue
		}

		md := metadata.Pairs(lbTokenKey, s.LoadBalanceToken)
		ip := net.IP(s.IpAddress)
		ipStr := ip.String()
		if ip.To4() == nil {
			// Add square brackets to IPv6 addresses; otherwise net.Dial() and
			// net.SplitHostPort() will return a "too many colons" error.
			ipStr = fmt.Sprintf("[%s]", ipStr)
		}
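		// For example (illustrative values only): an IPv4 backend 10.0.0.1 on
		// port 443 becomes the address "10.0.0.1:443", while an IPv6 backend
		// 2001:db8::1 becomes "[2001:db8::1]:443".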
		addr := imetadata.Set(resolver.Address{Addr: fmt.Sprintf("%s:%d", ipStr, s.Port)}, md)
		if logger.V(2) {
			logger.Infof("lbBalancer: server list entry[%d]: ipStr:|%s|, port:|%d|, load balancer token:|%v|",
				i, ipStr, s.Port, s.LoadBalanceToken)
		}
		backendAddrs = append(backendAddrs, addr)
	}

	// Call refreshSubConns to create/remove SubConns. If we are in fallback,
	// this also exits fallback.
	lb.refreshSubConns(backendAddrs, false, lb.usePickFirst)
}

// refreshSubConns creates/removes SubConns with backendAddrs, and refreshes
// the balancer state and picker.
//
// Caller must hold lb.mu.
func (lb *lbBalancer) refreshSubConns(backendAddrs []resolver.Address, fallback bool, pickFirst bool) {
	opts := balancer.NewSubConnOptions{}
	if !fallback {
		opts.CredsBundle = lb.grpclbBackendCreds
	}
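	// Note: when fallback is true, the backends come from the resolver rather
	// than from the remote balancer, so the grpclb backend credentials bundle
	// is not attached and the channel's regular dial credentials apply.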

	lb.backendAddrs = backendAddrs
	lb.backendAddrsWithoutMetadata = nil

	fallbackModeChanged := lb.inFallback != fallback
	lb.inFallback = fallback
	if fallbackModeChanged && lb.inFallback {
		// Clear the previously received server list when entering fallback, so
		// that if the balancer comes back and sends the same list again, the
		// new addresses will be used.
		lb.fullServerList = nil
	}

	balancingPolicyChanged := lb.usePickFirst != pickFirst
	oldUsePickFirst := lb.usePickFirst
	lb.usePickFirst = pickFirst

	if fallbackModeChanged || balancingPolicyChanged {
		// Remove all SubConns when switching balancing policy or switching
		// fallback mode.
		//
		// For fallback mode switching with pickfirst, we want to recreate the
		// SubConn because the creds could be different.
		for a, sc := range lb.subConns {
			if oldUsePickFirst {
				// If the old SubConns were created for pickfirst, bypass the
				// cache and remove them directly.
				lb.cc.cc.RemoveSubConn(sc)
			} else {
				lb.cc.RemoveSubConn(sc)
			}
			delete(lb.subConns, a)
		}
	}

	if lb.usePickFirst {
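		// In pick_first mode all backend addresses share a single SubConn, so
		// lb.subConns holds at most one entry; the range/break below grabs
		// that entry if it exists.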
		var (
			scKey resolver.Address
			sc    balancer.SubConn
		)
		for scKey, sc = range lb.subConns {
			break
		}
		if sc != nil {
			if len(backendAddrs) == 0 {
				lb.cc.cc.RemoveSubConn(sc)
				delete(lb.subConns, scKey)
				return
			}
			lb.cc.cc.UpdateAddresses(sc, backendAddrs)
			sc.Connect()
			return
		}
		// This bypasses the cc wrapper and its SubConn cache.
		sc, err := lb.cc.cc.NewSubConn(backendAddrs, opts)
		if err != nil {
			logger.Warningf("grpclb: failed to create new SubConn: %v", err)
			return
		}
		sc.Connect()
		lb.subConns[backendAddrs[0]] = sc
		lb.scStates[sc] = connectivity.Idle
		return
	}

	// addrsSet is the set converted from backendAddrsWithoutMetadata; it's
	// used for quick address lookups.
	addrsSet := make(map[resolver.Address]struct{})
	// Create new SubConns.
	for _, addr := range backendAddrs {
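		// Strip the per-address attributes (they carry per-address metadata
		// such as the grpclb token) so that addresses compare equal as map
		// keys regardless of their token.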
		addrWithoutAttrs := addr
		addrWithoutAttrs.Attributes = nil
		addrsSet[addrWithoutAttrs] = struct{}{}
		lb.backendAddrsWithoutMetadata = append(lb.backendAddrsWithoutMetadata, addrWithoutAttrs)

		if _, ok := lb.subConns[addrWithoutAttrs]; !ok {
			// Use the address with metadata (the LB token) to create the SubConn.
			sc, err := lb.cc.NewSubConn([]resolver.Address{addr}, opts)
			if err != nil {
				logger.Warningf("grpclb: failed to create new SubConn: %v", err)
				continue
			}
			lb.subConns[addrWithoutAttrs] = sc // Use the addr without MD as key for the map.
			if _, ok := lb.scStates[sc]; !ok {
				// Only set the state of a new sc to IDLE. The state could
				// already be READY for cached SubConns.
				lb.scStates[sc] = connectivity.Idle
			}
			sc.Connect()
		}
	}

	for a, sc := range lb.subConns {
		// a is not in the new address list; remove its SubConn.
		if _, ok := addrsSet[a]; !ok {
			lb.cc.RemoveSubConn(sc)
			delete(lb.subConns, a)
			// Keep the state of this sc in lb.scStates until its state becomes
			// Shutdown. The entry will be deleted in UpdateSubConnState.
		}
	}

	// Regenerate and update the picker after refreshing the SubConns because
	// with the cache, even if a SubConn was created/removed, there might be no
	// state change (the SubConn is kept in the cache, not actually
	// created/removed).
	lb.updateStateAndPicker(true, true)
}

type remoteBalancerCCWrapper struct {
	cc      *grpc.ClientConn
	lb      *lbBalancer
	backoff backoff.Strategy
	done    chan struct{}

	streamMu     sync.Mutex
	streamCancel func()

	// waitgroup to wait for all goroutines to exit.
	wg sync.WaitGroup
}

func (lb *lbBalancer) newRemoteBalancerCCWrapper() {
	var dopts []grpc.DialOption
	if creds := lb.opt.DialCreds; creds != nil {
		dopts = append(dopts, grpc.WithTransportCredentials(creds))
	} else if bundle := lb.grpclbClientConnCreds; bundle != nil {
		dopts = append(dopts, grpc.WithCredentialsBundle(bundle))
	} else {
		dopts = append(dopts, grpc.WithInsecure())
	}
	if lb.opt.Dialer != nil {
		dopts = append(dopts, grpc.WithContextDialer(lb.opt.Dialer))
	}
	if lb.opt.CustomUserAgent != "" {
		dopts = append(dopts, grpc.WithUserAgent(lb.opt.CustomUserAgent))
	}
	// Explicitly set pickfirst as the balancer.
	dopts = append(dopts, grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"pick_first"}`))
	dopts = append(dopts, grpc.WithResolvers(lb.manualResolver))
	if channelz.IsOn() {
		dopts = append(dopts, grpc.WithChannelzParentID(lb.opt.ChannelzParentID))
	}

	// Enable Keepalive for grpclb client.
	dopts = append(dopts, grpc.WithKeepaliveParams(keepalive.ClientParameters{
		Time:                20 * time.Second,
		Timeout:             10 * time.Second,
		PermitWithoutStream: true,
	}))
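	// With these parameters the client sends a keepalive ping after 20s of
	// inactivity (even with no active streams) and closes the connection if
	// the ping is not acknowledged within 10s, so a dead balancer connection
	// is detected reasonably quickly.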

	// The dial target is not important.
	//
	// The grpclb server addresses will set the ServerName field, and the
	// creds will receive that ServerName as the authority.
	cc, err := grpc.DialContext(context.Background(), lb.manualResolver.Scheme()+":///grpclb.subClientConn", dopts...)
	if err != nil {
		logger.Fatalf("failed to dial: %v", err)
	}
	ccw := &remoteBalancerCCWrapper{
		cc:      cc,
		lb:      lb,
		backoff: lb.backoff,
		done:    make(chan struct{}),
	}
	lb.ccRemoteLB = ccw
	ccw.wg.Add(1)
	go ccw.watchRemoteBalancer()
}

// close closes the ClientConn to the remote balancer and waits for all
// goroutines to finish.
func (ccw *remoteBalancerCCWrapper) close() {
	close(ccw.done)
	ccw.cc.Close()
	ccw.wg.Wait()
}

func (ccw *remoteBalancerCCWrapper) readServerList(s *balanceLoadClientStream) error {
	for {
		reply, err := s.Recv()
		if err != nil {
			if err == io.EOF {
				return errServerTerminatedConnection
			}
			return fmt.Errorf("grpclb: failed to recv server list: %v", err)
		}
		if serverList := reply.GetServerList(); serverList != nil {
			ccw.lb.processServerList(serverList)
		}
		if reply.GetFallbackResponse() != nil {
			// Eagerly enter fallback mode when the balancer sends a fallback
			// response.
			ccw.lb.mu.Lock()
			ccw.lb.refreshSubConns(ccw.lb.resolvedBackendAddrs, true, ccw.lb.usePickFirst)
			ccw.lb.mu.Unlock()
		}
	}
}

func (ccw *remoteBalancerCCWrapper) sendLoadReport(s *balanceLoadClientStream, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	lastZero := false
	for {
		select {
		case <-ticker.C:
		case <-s.Context().Done():
			return
		}
		stats := ccw.lb.clientStats.toClientStats()
		zero := isZeroStats(stats)
		if zero && lastZero {
			// Quash redundant empty load reports.
			continue
		}
		lastZero = zero
		t := time.Now()
		stats.Timestamp = &timestamppb.Timestamp{
			Seconds: t.Unix(),
			Nanos:   int32(t.Nanosecond()),
		}
		if err := s.Send(&lbpb.LoadBalanceRequest{
			LoadBalanceRequestType: &lbpb.LoadBalanceRequest_ClientStats{
				ClientStats: stats,
			},
		}); err != nil {
			return
		}
	}
}

func (ccw *remoteBalancerCCWrapper) callRemoteBalancer(ctx context.Context) (backoff bool, _ error) {
	lbClient := &loadBalancerClient{cc: ccw.cc}
	stream, err := lbClient.BalanceLoad(ctx, grpc.WaitForReady(true))
	if err != nil {
		return true, fmt.Errorf("grpclb: failed to perform RPC to the remote balancer: %v", err)
	}
	ccw.lb.mu.Lock()
	ccw.lb.remoteBalancerConnected = true
	ccw.lb.mu.Unlock()

	// grpclb handshake on the stream.
	initReq := &lbpb.LoadBalanceRequest{
		LoadBalanceRequestType: &lbpb.LoadBalanceRequest_InitialRequest{
			InitialRequest: &lbpb.InitialLoadBalanceRequest{
				Name: ccw.lb.target,
			},
		},
	}
	if err := stream.Send(initReq); err != nil {
		return true, fmt.Errorf("grpclb: failed to send init request: %v", err)
	}
	reply, err := stream.Recv()
	if err != nil {
		return true, fmt.Errorf("grpclb: failed to recv init response: %v", err)
	}
	initResp := reply.GetInitialResponse()
	if initResp == nil {
		return true, fmt.Errorf("grpclb: reply from remote balancer did not include initial response")
	}

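	// Start the load-reporting goroutine only if the balancer requested client
	// stats by specifying a non-zero report interval in the initial response.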
	ccw.wg.Add(1)
	go func() {
		defer ccw.wg.Done()
		if d := convertDuration(initResp.ClientStatsReportInterval); d > 0 {
			ccw.sendLoadReport(stream, d)
		}
	}()
	// No backoff if init req/resp handshake was successful.
	return false, ccw.readServerList(stream)
}

// cancelRemoteBalancerCall cancels the context used by the stream to the remote
// balancer. watchRemoteBalancer() takes care of restarting this call after the
// stream fails.
func (ccw *remoteBalancerCCWrapper) cancelRemoteBalancerCall() {
	ccw.streamMu.Lock()
	if ccw.streamCancel != nil {
		ccw.streamCancel()
		ccw.streamCancel = nil
	}
	ccw.streamMu.Unlock()
}

func (ccw *remoteBalancerCCWrapper) watchRemoteBalancer() {
	defer func() {
		ccw.wg.Done()
		ccw.streamMu.Lock()
		if ccw.streamCancel != nil {
			// This makes sure that we don't leak the context when returning
			// directly from inside the `for` loop below.
			ccw.streamCancel()
			ccw.streamCancel = nil
		}
		ccw.streamMu.Unlock()
	}()

	var retryCount int
	var ctx context.Context
	for {
		ccw.streamMu.Lock()
		if ccw.streamCancel != nil {
			ccw.streamCancel()
			ccw.streamCancel = nil
		}
		ctx, ccw.streamCancel = context.WithCancel(context.Background())
		ccw.streamMu.Unlock()

		doBackoff, err := ccw.callRemoteBalancer(ctx)
		select {
		case <-ccw.done:
			return
		default:
			if err != nil {
				if err == errServerTerminatedConnection {
					logger.Info(err)
				} else {
					logger.Warning(err)
				}
			}
		}
		// Trigger a re-resolve when the stream errors.
		ccw.lb.cc.cc.ResolveNow(resolver.ResolveNowOptions{})

		ccw.lb.mu.Lock()
		ccw.lb.remoteBalancerConnected = false
		ccw.lb.fullServerList = nil
		// Enter fallback when the connection to the remote balancer is lost
		// and the aggregated state is not Ready.
		if !ccw.lb.inFallback && ccw.lb.state != connectivity.Ready {
			// Entering fallback.
			ccw.lb.refreshSubConns(ccw.lb.resolvedBackendAddrs, true, ccw.lb.usePickFirst)
		}
		ccw.lb.mu.Unlock()

		if !doBackoff {
			retryCount = 0
			continue
		}

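		// Back off before retrying: callRemoteBalancer asks for backoff only
		// when the initial handshake failed, and the delay grows with each
		// consecutive failed attempt.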
		timer := time.NewTimer(ccw.backoff.Backoff(retryCount))
		select {
		case <-timer.C:
		case <-ccw.done:
			timer.Stop()
			return
		}
		retryCount++
	}
}