github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/cluster/interceptors.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cluster 16 17 import ( 18 "context" 19 "strconv" 20 "strings" 21 "sync" 22 "time" 23 24 "github.com/sirupsen/logrus" 25 "google.golang.org/grpc" 26 "google.golang.org/grpc/codes" 27 "google.golang.org/grpc/metadata" 28 "google.golang.org/grpc/status" 29 "gopkg.in/square/go-jose.v2/jwt" 30 31 "github.com/dolthub/dolt/go/libraries/utils/jwtauth" 32 ) 33 34 const clusterRoleHeader = "x-dolt-cluster-role" 35 const clusterRoleEpochHeader = "x-dolt-cluster-role-epoch" 36 37 var writeEndpoints map[string]bool 38 39 func init() { 40 writeEndpoints = make(map[string]bool) 41 writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/Commit"] = true 42 writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/AddTableFiles"] = true 43 writeEndpoints["/dolt.services.remotesapi.v1alpha1.ChunkStoreService/GetUploadLocations"] = true 44 } 45 46 func isLikelyServerResponse(err error) bool { 47 code := status.Code(err) 48 switch code { 49 case codes.Unavailable: 50 fallthrough 51 case codes.DeadlineExceeded: 52 fallthrough 53 case codes.Canceled: 54 return false 55 default: 56 return true 57 } 58 } 59 60 // clientinterceptor is installed as a Unary and Stream client interceptor on 61 // the client conns that are used to communicate with standby remotes. The 62 // cluster.Controller sets this server's current Role and role epoch on the 63 // interceptor anytime it changes. In turn, this interceptor: 64 // * adds the server's current role and epoch to the request headers for every 65 // outbound request. 66 // * fails all outgoing requests immediately with codes.FailedPrecondition if 67 // the role == RoleStandby, since this server should not be replicating when it 68 // believes it is a standby. 69 // * watches returned response headers for a situation which causes this server 70 // to force downgrade from primary to standby. In particular, when a returned 71 // response header asserts that the standby replica is a primary at a higher 72 // epoch than this server, this incterceptor coordinates with the Controller to 73 // immediately transition to standby and to stop replicating to the standby. 74 type clientinterceptor struct { 75 lgr *logrus.Entry 76 role Role 77 epoch int 78 mu sync.Mutex 79 roleSetter func(role string, epoch int) 80 } 81 82 func (ci *clientinterceptor) setRole(role Role, epoch int) { 83 ci.mu.Lock() 84 defer ci.mu.Unlock() 85 ci.role = role 86 ci.epoch = epoch 87 } 88 89 func (ci *clientinterceptor) getRole() (Role, int) { 90 ci.mu.Lock() 91 defer ci.mu.Unlock() 92 return ci.role, ci.epoch 93 } 94 95 func (ci *clientinterceptor) Stream() grpc.StreamClientInterceptor { 96 return func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) { 97 role, epoch := ci.getRole() 98 ci.lgr.Tracef("cluster: clientinterceptor: processing request to %s, role %s", method, string(role)) 99 if role == RoleStandby { 100 return nil, status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is a standby and is not currently replicating to its standby") 101 } 102 if role == RoleDetectedBrokenConfig { 103 return nil, status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is in detected_broken_config and is not currently replicating to its standby") 104 } 105 ctx = metadata.AppendToOutgoingContext(ctx, clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch)) 106 var header metadata.MD 107 stream, err := streamer(ctx, desc, cc, method, append(opts, grpc.Header(&header))...) 108 ci.handleResponseHeaders(header, err) 109 return stream, err 110 } 111 } 112 113 func (ci *clientinterceptor) Unary() grpc.UnaryClientInterceptor { 114 return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error { 115 role, epoch := ci.getRole() 116 ci.lgr.Tracef("cluster: clientinterceptor: processing request to %s, role %s", method, string(role)) 117 if role == RoleStandby { 118 return status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is a standby and is not currently replicating to its standby") 119 } 120 if role == RoleDetectedBrokenConfig { 121 return status.Error(codes.FailedPrecondition, "cluster: clientinterceptor: this server is in detected_broken_config and is not currently replicating to its standby") 122 } 123 ctx = metadata.AppendToOutgoingContext(ctx, clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch)) 124 var header metadata.MD 125 err := invoker(ctx, method, req, reply, cc, append(opts, grpc.Header(&header))...) 126 ci.handleResponseHeaders(header, err) 127 return err 128 } 129 } 130 131 func (ci *clientinterceptor) handleResponseHeaders(header metadata.MD, err error) { 132 role, epoch := ci.getRole() 133 if role != RolePrimary { 134 // By the time we process this response, we were no longer a primary. 135 return 136 } 137 respEpochs := header.Get(clusterRoleEpochHeader) 138 respRoles := header.Get(clusterRoleHeader) 139 if len(respEpochs) > 0 && len(respRoles) > 0 { 140 respRole := respRoles[0] 141 respEpoch, err := strconv.Atoi(respEpochs[0]) 142 if err == nil { 143 if respRole == string(RolePrimary) { 144 if respEpoch == epoch { 145 ci.lgr.Errorf("cluster: clientinterceptor: this server and the server replicating to it are both primary at the same epoch. force transitioning to detected_broken_config.") 146 ci.roleSetter(string(RoleDetectedBrokenConfig), respEpoch) 147 } else if respEpoch > epoch { 148 // The server we replicate to thinks it is the primary at a higher epoch than us... 149 ci.lgr.Warnf("cluster: clientinterceptor: this server is primary at epoch %d. a server it attempted to replicate to is primary at epoch %d. force transitioning to standby.", epoch, respEpoch) 150 ci.roleSetter(string(RoleStandby), respEpoch) 151 } 152 } else if respRole == string(RoleDetectedBrokenConfig) && respEpoch >= epoch { 153 ci.lgr.Errorf("cluster: clientinterceptor: this server learned from its standby that the standby is in detected_broken_config at the same or higher epoch. force transitioning to detected_broken_config.") 154 ci.roleSetter(string(RoleDetectedBrokenConfig), respEpoch) 155 } 156 } else { 157 ci.lgr.Errorf("cluster: clientinterceptor: failed to parse epoch in response header; something is wrong: %v", err) 158 } 159 } else if isLikelyServerResponse(err) { 160 ci.lgr.Warnf("cluster: clientinterceptor: response was missing role and epoch metadata") 161 } 162 } 163 164 func (ci *clientinterceptor) Options() []grpc.DialOption { 165 return []grpc.DialOption{ 166 grpc.WithChainUnaryInterceptor(ci.Unary()), 167 grpc.WithChainStreamInterceptor(ci.Stream()), 168 } 169 } 170 171 // serverinterceptor is installed as a Unary and Stream interceptor on a 172 // ChunkStoreServer which is serving a SQL database as a standby remote. The 173 // cluster.Controller sets this server's current Role and role epoch on the 174 // interceptor anytime it changes. In turn, this interceptor has the following 175 // behavior: 176 // * for any incoming standby traffic, it will add the server's current role 177 // and epoch to the response headers for every request. 178 // * for any incoming standby traffic, it will fail incoming requests 179 // immediately with codes.FailedPrecondition if the current role != 180 // RoleStandby, since nothing should be replicating to us in that state. 181 // * watches incoming request headers for a situation which causes this server 182 // to force downgrade from primary to standby. In particular, when an incoming 183 // request asserts that the client is the current primary at an epoch higher 184 // than our current epoch, this interceptor coordinates with the Controller to 185 // immediately transition to standby and allow replication requests through. 186 // * for incoming requests which are not standby, it will currently fail the 187 // requests with codes.Unauthenticated. Eventually, it will allow read-only 188 // traffic through which is authenticated and authorized. 189 // 190 // The serverinterceptor is responsible for authenticating incoming requests 191 // from standby replicas. It is instantiated with a jwtauth.KeyProvider and 192 // some jwt.Expected. Incoming requests must have a valid, unexpired, signed 193 // JWT, signed by a key accessible in the KeyProvider. 194 type serverinterceptor struct { 195 lgr *logrus.Entry 196 role Role 197 epoch int 198 mu sync.Mutex 199 roleSetter func(role string, epoch int) 200 201 keyProvider jwtauth.KeyProvider 202 jwtExpected jwt.Expected 203 } 204 205 func (si *serverinterceptor) Stream() grpc.StreamServerInterceptor { 206 return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 207 fromClusterMember := false 208 if md, ok := metadata.FromIncomingContext(ss.Context()); ok { 209 fromClusterMember = si.handleRequestHeaders(md) 210 } 211 if fromClusterMember { 212 if err := si.authenticate(ss.Context()); err != nil { 213 return err 214 } 215 // After handleRequestHeaders, our role may have changed, so we fetch it again here. 216 role, epoch := si.getRole() 217 if err := grpc.SetHeader(ss.Context(), metadata.Pairs(clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))); err != nil { 218 return err 219 } 220 if role == RolePrimary { 221 // As a primary, we do not accept replication requests. 222 return status.Error(codes.FailedPrecondition, "this server is a primary and is not currently accepting replication") 223 } 224 if role == RoleDetectedBrokenConfig { 225 // In detected_brokne_config we do not accept replication requests. 226 return status.Error(codes.FailedPrecondition, "this server is currently in detected_broken_config and is not currently accepting replication") 227 } 228 return handler(srv, ss) 229 } else if isWrite := writeEndpoints[info.FullMethod]; isWrite { 230 return status.Error(codes.Unimplemented, "unimplemented") 231 } else { 232 return status.Error(codes.Unauthenticated, "unauthenticated") 233 } 234 } 235 } 236 237 func (si *serverinterceptor) Unary() grpc.UnaryServerInterceptor { 238 return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { 239 fromClusterMember := false 240 if md, ok := metadata.FromIncomingContext(ctx); ok { 241 fromClusterMember = si.handleRequestHeaders(md) 242 } 243 if fromClusterMember { 244 if err := si.authenticate(ctx); err != nil { 245 return nil, err 246 } 247 // After handleRequestHeaders, our role may have changed, so we fetch it again here. 248 role, epoch := si.getRole() 249 if err := grpc.SetHeader(ctx, metadata.Pairs(clusterRoleHeader, string(role), clusterRoleEpochHeader, strconv.Itoa(epoch))); err != nil { 250 return nil, err 251 } 252 if role == RolePrimary { 253 // As a primary, we do not accept replication requests. 254 return nil, status.Error(codes.FailedPrecondition, "this server is a primary and is not currently accepting replication") 255 } 256 if role == RoleDetectedBrokenConfig { 257 // In detected_broken_config we do not accept replication requests. 258 return nil, status.Error(codes.FailedPrecondition, "this server is currently in detected_broken_config and is not currently accepting replication") 259 } 260 return handler(ctx, req) 261 } else if isWrite := writeEndpoints[info.FullMethod]; isWrite { 262 return nil, status.Error(codes.Unimplemented, "unimplemented") 263 } else { 264 return nil, status.Error(codes.Unauthenticated, "unauthenticated") 265 } 266 } 267 } 268 269 func (si *serverinterceptor) handleRequestHeaders(header metadata.MD) bool { 270 role, epoch := si.getRole() 271 epochs := header.Get(clusterRoleEpochHeader) 272 roles := header.Get(clusterRoleHeader) 273 if len(epochs) > 0 && len(roles) > 0 { 274 if roles[0] == string(RolePrimary) { 275 if reqepoch, err := strconv.Atoi(epochs[0]); err == nil { 276 if reqepoch == epoch && role == RolePrimary { 277 // Misconfiguration in the cluster means this 278 // server and its standby are marked as Primary 279 // at the same epoch. We will become standby 280 // and our peer will become standby. An 281 // operator will need to get involved. 282 si.lgr.Errorf("cluster: serverinterceptor: this server and its standby replica are both primary at the same epoch. force transitioning to detected_broken_config.") 283 si.roleSetter(string(RoleDetectedBrokenConfig), reqepoch) 284 } else if reqepoch > epoch { 285 if role == RolePrimary { 286 // The client replicating to us thinks it is the primary at a higher epoch than us. 287 si.lgr.Warnf("cluster: serverinterceptor: this server is primary at epoch %d. the server replicating to it is primary at epoch %d. force transitioning to standby.", epoch, reqepoch) 288 } else if role == RoleDetectedBrokenConfig { 289 si.lgr.Warnf("cluster: serverinterceptor: this server is detected_broken_config at epoch %d. the server replicating to it is primary at epoch %d. transitioning to standby.", epoch, reqepoch) 290 } 291 si.roleSetter(string(RoleStandby), reqepoch) 292 } 293 } 294 } 295 // returns true if the request was from a cluster replica, false otherwise 296 return true 297 } 298 return false 299 } 300 301 func (si *serverinterceptor) Options() []grpc.ServerOption { 302 return []grpc.ServerOption{ 303 grpc.ChainUnaryInterceptor(si.Unary()), 304 grpc.ChainStreamInterceptor(si.Stream()), 305 } 306 } 307 308 func (si *serverinterceptor) setRole(role Role, epoch int) { 309 si.mu.Lock() 310 defer si.mu.Unlock() 311 si.role = role 312 si.epoch = epoch 313 } 314 315 func (si *serverinterceptor) getRole() (Role, int) { 316 si.mu.Lock() 317 defer si.mu.Unlock() 318 return si.role, si.epoch 319 } 320 321 func (si *serverinterceptor) authenticate(ctx context.Context) error { 322 if md, ok := metadata.FromIncomingContext(ctx); ok { 323 auths := md.Get("authorization") 324 if len(auths) != 1 { 325 si.lgr.Info("incoming standby request had no authorization") 326 return status.Error(codes.Unauthenticated, "unauthenticated") 327 } 328 auth := auths[0] 329 if !strings.HasPrefix(auth, "Bearer ") { 330 si.lgr.Info("incoming standby request had malformed authentication header") 331 return status.Error(codes.Unauthenticated, "unauthenticated") 332 } 333 auth = strings.TrimPrefix(auth, "Bearer ") 334 _, err := jwtauth.ValidateJWT(auth, time.Now(), si.keyProvider, si.jwtExpected) 335 if err != nil { 336 si.lgr.Infof("incoming standby request authorization header failed to verify: %v", err) 337 return status.Error(codes.Unauthenticated, "unauthenticated") 338 } 339 return nil 340 } 341 return status.Error(codes.Unauthenticated, "unauthenticated") 342 }