github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/cluster/interceptors.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cluster
    16  
    17  import (
    18  	"context"
    19  	"strconv"
    20  	"strings"
    21  	"sync"
    22  	"time"
    23  
    24  	"github.com/sirupsen/logrus"
    25  	"google.golang.org/grpc"
    26  	"google.golang.org/grpc/codes"
    27  	"google.golang.org/grpc/metadata"
    28  	"google.golang.org/grpc/status"
    29  	"gopkg.in/square/go-jose.v2/jwt"
    30  
    31  	"github.com/dolthub/dolt/go/libraries/utils/jwtauth"
    32  )
    33  
    34  const clusterRoleHeader = "x-dolt-cluster-role"
    35  const clusterRoleEpochHeader = "x-dolt-cluster-role-epoch"
    36  
    37  var writeEndpoints map[string]bool
    38  
    39  func init() {
    40  	writeEndpoints = make(map[string]bool)
    41  	writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/Commit"] = true
    42  	writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/AddTableFiles"] = true
    43  	writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/GetUploadLocations"] = true
    44  }
    45  
    46  func isLikelyServerResponse(err error) bool {
    47  	code := status.Code(err)
    48  	switch code {
    49  	case codes.Unavailable:
    50  		fallthrough
    51  	case codes.DeadlineExceeded:
    52  		fallthrough
    53  	case codes.Canceled:
    54  		return false
    55  	default:
    56  		return true
    57  	}
    58  }
    59  
// clientinterceptor is installed as a Unary and Stream client interceptor on
// the client conns that are used to communicate with standby remotes. The
// cluster.Controller sets this server's current Role and role epoch on the
// interceptor anytime it changes. In turn, this interceptor:
// * adds the server's current role and epoch to the request headers for every
// outbound request.
// * fails all outgoing requests immediately with codes.FailedPrecondition if
// the role is RoleStandby or RoleDetectedBrokenConfig, since this server
// should not be replicating in either of those states.
// * watches returned response headers for a situation which causes this server
// to force downgrade from primary to standby. In particular, when a returned
// response header asserts that the standby replica is a primary at a higher
// epoch than this server, this interceptor coordinates with the Controller to
// immediately transition to standby and to stop replicating to the standby.
// * transitions to detected_broken_config when a response header asserts
// that the peer is primary at the same epoch as this server, or that the
// peer is itself in detected_broken_config at the same or a higher epoch.
type clientinterceptor struct {
	// lgr receives the interceptor's trace, warning and error logs.
	lgr        *logrus.Entry
	// role and epoch are the most recently set role state. They are read
	// and written under mu by getRole and setRole.
	role       Role
	epoch      int
	// mu guards role and epoch.
	mu         sync.Mutex
	// roleSetter is invoked with the new role and epoch when response
	// headers require this server to change role.
	roleSetter func(role string, epoch int)
}
    81  
    82  func (ci *clientinterceptor) setRole(role Role, epoch int) {
    83  	ci.mu.Lock()
    84  	defer ci.mu.Unlock()
    85  	ci.role = role
    86  	ci.epoch = epoch
    87  }
    88  
    89  func (ci *clientinterceptor) getRole() (Role, int) {
    90  	ci.mu.Lock()
    91  	defer ci.mu.Unlock()
    92  	return ci.role, ci.epoch
    93  }
    94  
    95  func (ci *clientinterceptor) Stream() grpc.StreamClientInterceptor {
    96  	return func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
    97  		role, epoch := ci.getRole()
    98  		ci.lgr.Tracef("cluster: clientinterceptor: processing request to %s, role %s", method, string(role))
    99  		if role == RoleStandby {
   100  			return nil, status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is a standby and is not currently replicating to its standby")
   101  		}
   102  		if role == RoleDetectedBrokenConfig {
   103  			return nil, status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is in detected_broken_config and is not currently replicating to its standby")
   104  		}
   105  		ctx = metadata.AppendToOutgoingContext(ctx, clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))
   106  		var header metadata.MD
   107  		stream, err := streamer(ctx, desc, cc, method, append(opts, grpc.Header(&header))...)
   108  		ci.handleResponseHeaders(header, err)
   109  		return stream, err
   110  	}
   111  }
   112  
   113  func (ci *clientinterceptor) Unary() grpc.UnaryClientInterceptor {
   114  	return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
   115  		role, epoch := ci.getRole()
   116  		ci.lgr.Tracef("cluster: clientinterceptor: processing request to %s, role %s", method, string(role))
   117  		if role == RoleStandby {
   118  			return status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is a standby and is not currently replicating to its standby")
   119  		}
   120  		if role == RoleDetectedBrokenConfig {
   121  			return status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is in detected_broken_config and is not currently replicating to its standby")
   122  		}
   123  		ctx = metadata.AppendToOutgoingContext(ctx, clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))
   124  		var header metadata.MD
   125  		err := invoker(ctx, method, req, reply, cc, append(opts, grpc.Header(&header))...)
   126  		ci.handleResponseHeaders(header, err)
   127  		return err
   128  	}
   129  }
   130  
   131  func (ci *clientinterceptor) handleResponseHeaders(header metadata.MD, err error) {
   132  	role, epoch := ci.getRole()
   133  	if role != RolePrimary {
   134  		// By the time we process this response, we were no longer a primary.
   135  		return
   136  	}
   137  	respEpochs := header.Get(clusterRoleEpochHeader)
   138  	respRoles := header.Get(clusterRoleHeader)
   139  	if len(respEpochs) > 0 && len(respRoles) > 0 {
   140  		respRole := respRoles[0]
   141  		respEpoch, err := strconv.Atoi(respEpochs[0])
   142  		if err == nil {
   143  			if respRole == string(RolePrimary) {
   144  				if respEpoch == epoch {
   145  					ci.lgr.Errorf("cluster: clientinterceptor: this server and the server replicating to it are both primary at the same epoch. force transitioning to detected_broken_config.")
   146  					ci.roleSetter(string(RoleDetectedBrokenConfig), respEpoch)
   147  				} else if respEpoch > epoch {
   148  					// The server we replicate to thinks it is the primary at a higher epoch than us...
   149  					ci.lgr.Warnf("cluster: clientinterceptor: this server is primary at epoch %d. a server it attempted to replicate to is primary at epoch %d. force transitioning to standby.", epoch, respEpoch)
   150  					ci.roleSetter(string(RoleStandby), respEpoch)
   151  				}
   152  			} else if respRole == string(RoleDetectedBrokenConfig) && respEpoch >= epoch {
   153  				ci.lgr.Errorf("cluster: clientinterceptor: this server learned from its standby that the standby is in detected_broken_config at the same or higher epoch. force transitioning to detected_broken_config.")
   154  				ci.roleSetter(string(RoleDetectedBrokenConfig), respEpoch)
   155  			}
   156  		} else {
   157  			ci.lgr.Errorf("cluster: clientinterceptor: failed to parse epoch in response header; something is wrong: %v", err)
   158  		}
   159  	} else if isLikelyServerResponse(err) {
   160  		ci.lgr.Warnf("cluster: clientinterceptor: response was missing role and epoch metadata")
   161  	}
   162  }
   163  
   164  func (ci *clientinterceptor) Options() []grpc.DialOption {
   165  	return []grpc.DialOption{
   166  		grpc.WithChainUnaryInterceptor(ci.Unary()),
   167  		grpc.WithChainStreamInterceptor(ci.Stream()),
   168  	}
   169  }
   170  
// serverinterceptor is installed as a Unary and Stream interceptor on a
// ChunkStoreServer which is serving a SQL database as a standby remote. The
// cluster.Controller sets this server's current Role and role epoch on the
// interceptor anytime it changes. In turn, this interceptor has the following
// behavior:
// * for any incoming standby traffic, it will add the server's current role
// and epoch to the response headers for every request.
// * for any incoming standby traffic, it will fail incoming requests
// immediately with codes.FailedPrecondition if the current role !=
// RoleStandby, since nothing should be replicating to us in that state.
// * watches incoming request headers for a situation which causes this server
// to force downgrade from primary to standby. In particular, when an incoming
// request asserts that the client is the current primary at an epoch higher
// than our current epoch, this interceptor coordinates with the Controller to
// immediately transition to standby and allow replication requests through.
// * when an incoming request asserts that the client is primary at the same
// epoch at which this server is primary, it transitions to
// detected_broken_config.
// * for incoming requests which do not carry the cluster role and epoch
// headers, it will currently fail the requests: with codes.Unimplemented for
// the methods listed in writeEndpoints, and with codes.Unauthenticated
// otherwise. Eventually, it will allow read-only traffic through which is
// authenticated and authorized.
//
// The serverinterceptor is responsible for authenticating incoming requests
// from standby replicas. It is instantiated with a jwtauth.KeyProvider and
// some jwt.Expected. Incoming requests must have a valid, unexpired, signed
// JWT, signed by a key accessible in the KeyProvider.
type serverinterceptor struct {
	// lgr receives the interceptor's informational, warning and error logs.
	lgr        *logrus.Entry
	// role and epoch are the most recently set role state. They are read
	// and written under mu by getRole and setRole.
	role       Role
	epoch      int
	// mu guards role and epoch.
	mu         sync.Mutex
	// roleSetter is invoked with the new role and epoch when request
	// headers require this server to change role.
	roleSetter func(role string, epoch int)

	// keyProvider and jwtExpected are passed to jwtauth.ValidateJWT when
	// verifying the bearer token on an incoming cluster request.
	keyProvider jwtauth.KeyProvider
	jwtExpected jwt.Expected
}
   204  
   205  func (si *serverinterceptor) Stream() grpc.StreamServerInterceptor {
   206  	return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
   207  		fromClusterMember := false
   208  		if md, ok := metadata.FromIncomingContext(ss.Context()); ok {
   209  			fromClusterMember = si.handleRequestHeaders(md)
   210  		}
   211  		if fromClusterMember {
   212  			if err := si.authenticate(ss.Context()); err != nil {
   213  				return err
   214  			}
   215  			// After handleRequestHeaders, our role may have changed, so we fetch it again here.
   216  			role, epoch := si.getRole()
   217  			if err := grpc.SetHeader(ss.Context(), metadata.Pairs(clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))); err != nil {
   218  				return err
   219  			}
   220  			if role == RolePrimary {
   221  				// As a primary, we do not accept replication requests.
   222  				return status.Error(codes.FailedPrecondition, "this server is a primary and is not currently accepting replication")
   223  			}
   224  			if role == RoleDetectedBrokenConfig {
   225  				// In detected_brokne_config we do not accept replication requests.
   226  				return status.Error(codes.FailedPrecondition, "this server is currently in detected_broken_config and is not currently accepting replication")
   227  			}
   228  			return handler(srv, ss)
   229  		} else if isWrite := writeEndpoints[info.FullMethod]; isWrite {
   230  			return status.Error(codes.Unimplemented, "unimplemented")
   231  		} else {
   232  			return status.Error(codes.Unauthenticated, "unauthenticated")
   233  		}
   234  	}
   235  }
   236  
   237  func (si *serverinterceptor) Unary() grpc.UnaryServerInterceptor {
   238  	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
   239  		fromClusterMember := false
   240  		if md, ok := metadata.FromIncomingContext(ctx); ok {
   241  			fromClusterMember = si.handleRequestHeaders(md)
   242  		}
   243  		if fromClusterMember {
   244  			if err := si.authenticate(ctx); err != nil {
   245  				return nil, err
   246  			}
   247  			// After handleRequestHeaders, our role may have changed, so we fetch it again here.
   248  			role, epoch := si.getRole()
   249  			if err := grpc.SetHeader(ctx, metadata.Pairs(clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))); err != nil {
   250  				return nil, err
   251  			}
   252  			if role == RolePrimary {
   253  				// As a primary, we do not accept replication requests.
   254  				return nil, status.Error(codes.FailedPrecondition, "this server is a primary and is not currently accepting replication")
   255  			}
   256  			if role == RoleDetectedBrokenConfig {
   257  				// In detected_broken_config we do not accept replication requests.
   258  				return nil, status.Error(codes.FailedPrecondition, "this server is currently in detected_broken_config and is not currently accepting replication")
   259  			}
   260  			return handler(ctx, req)
   261  		} else if isWrite := writeEndpoints[info.FullMethod]; isWrite {
   262  			return nil, status.Error(codes.Unimplemented, "unimplemented")
   263  		} else {
   264  			return nil, status.Error(codes.Unauthenticated, "unauthenticated")
   265  		}
   266  	}
   267  }
   268  
   269  func (si *serverinterceptor) handleRequestHeaders(header metadata.MD) bool {
   270  	role, epoch := si.getRole()
   271  	epochs := header.Get(clusterRoleEpochHeader)
   272  	roles := header.Get(clusterRoleHeader)
   273  	if len(epochs) > 0 && len(roles) > 0 {
   274  		if roles[0] == string(RolePrimary) {
   275  			if reqepoch, err := strconv.Atoi(epochs[0]); err == nil {
   276  				if reqepoch == epoch && role == RolePrimary {
   277  					// Misconfiguration in the cluster means this
   278  					// server and its standby are marked as Primary
   279  					// at the same epoch. We will become standby
   280  					// and our peer will become standby. An
   281  					// operator will need to get involved.
   282  					si.lgr.Errorf("cluster: serverinterceptor: this server and its standby replica are both primary at the same epoch. force transitioning to detected_broken_config.")
   283  					si.roleSetter(string(RoleDetectedBrokenConfig), reqepoch)
   284  				} else if reqepoch > epoch {
   285  					if role == RolePrimary {
   286  						// The client replicating to us thinks it is the primary at a higher epoch than us.
   287  						si.lgr.Warnf("cluster: serverinterceptor: this server is primary at epoch %d. the server replicating to it is primary at epoch %d. force transitioning to standby.", epoch, reqepoch)
   288  					} else if role == RoleDetectedBrokenConfig {
   289  						si.lgr.Warnf("cluster: serverinterceptor: this server is detected_broken_config at epoch %d. the server replicating to it is primary at epoch %d. transitioning to standby.", epoch, reqepoch)
   290  					}
   291  					si.roleSetter(string(RoleStandby), reqepoch)
   292  				}
   293  			}
   294  		}
   295  		// returns true if the request was from a cluster replica, false otherwise
   296  		return true
   297  	}
   298  	return false
   299  }
   300  
   301  func (si *serverinterceptor) Options() []grpc.ServerOption {
   302  	return []grpc.ServerOption{
   303  		grpc.ChainUnaryInterceptor(si.Unary()),
   304  		grpc.ChainStreamInterceptor(si.Stream()),
   305  	}
   306  }
   307  
   308  func (si *serverinterceptor) setRole(role Role, epoch int) {
   309  	si.mu.Lock()
   310  	defer si.mu.Unlock()
   311  	si.role = role
   312  	si.epoch = epoch
   313  }
   314  
   315  func (si *serverinterceptor) getRole() (Role, int) {
   316  	si.mu.Lock()
   317  	defer si.mu.Unlock()
   318  	return si.role, si.epoch
   319  }
   320  
   321  func (si *serverinterceptor) authenticate(ctx context.Context) error {
   322  	if md, ok := metadata.FromIncomingContext(ctx); ok {
   323  		auths := md.Get("authorization")
   324  		if len(auths) != 1 {
   325  			si.lgr.Info("incoming standby request had no authorization")
   326  			return status.Error(codes.Unauthenticated, "unauthenticated")
   327  		}
   328  		auth := auths[0]
   329  		if !strings.HasPrefix(auth, "Bearer ") {
   330  			si.lgr.Info("incoming standby request had malformed authentication header")
   331  			return status.Error(codes.Unauthenticated, "unauthenticated")
   332  		}
   333  		auth = strings.TrimPrefix(auth, "Bearer ")
   334  		_, err := jwtauth.ValidateJWT(auth, time.Now(), si.keyProvider, si.jwtExpected)
   335  		if err != nil {
   336  			si.lgr.Infof("incoming standby request authorization header failed to verify: %v", err)
   337  			return status.Error(codes.Unauthenticated, "unauthenticated")
   338  		}
   339  		return nil
   340  	}
   341  	return status.Error(codes.Unauthenticated, "unauthenticated")
   342  }