github.imxd.top/hashicorp/consul@v1.4.5/agent/xds/server.go (about) 1 package xds 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "log" 8 "sync/atomic" 9 "time" 10 11 "google.golang.org/grpc" 12 "google.golang.org/grpc/codes" 13 "google.golang.org/grpc/credentials" 14 "google.golang.org/grpc/metadata" 15 "google.golang.org/grpc/status" 16 17 envoy "github.com/envoyproxy/go-control-plane/envoy/api/v2" 18 envoyauthz "github.com/envoyproxy/go-control-plane/envoy/service/auth/v2alpha" 19 envoydisco "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v2" 20 "github.com/gogo/googleapis/google/rpc" 21 "github.com/gogo/protobuf/proto" 22 "github.com/hashicorp/consul/acl" 23 "github.com/hashicorp/consul/agent/cache" 24 "github.com/hashicorp/consul/agent/connect" 25 "github.com/hashicorp/consul/agent/proxycfg" 26 "github.com/hashicorp/consul/agent/structs" 27 ) 28 29 // ADSStream is a shorter way of referring to this thing... 30 type ADSStream = envoydisco.AggregatedDiscoveryService_StreamAggregatedResourcesServer 31 32 const ( 33 // Resource types in xDS v2. These are copied from 34 // envoyproxy/go-control-plane/pkg/cache/resource.go since we don't need any of 35 // the rest of that package. 36 typePrefix = "type.googleapis.com/envoy.api.v2." 37 38 // EndpointType is the TypeURL for Endpoint discovery responses. 39 EndpointType = typePrefix + "ClusterLoadAssignment" 40 41 // ClusterType is the TypeURL for Cluster discovery responses. 42 ClusterType = typePrefix + "Cluster" 43 44 // RouteType is the TypeURL for Route discovery responses. 45 RouteType = typePrefix + "RouteConfiguration" 46 47 // ListenerType is the TypeURL for Listener discovery responses. 48 ListenerType = typePrefix + "Listener" 49 50 // PublicListenerName is the name we give the public listener in Envoy config. 51 PublicListenerName = "public_listener" 52 53 // LocalAppClusterName is the name we give the local application "cluster" in 54 // Envoy config. 55 LocalAppClusterName = "local_app" 56 57 // LocalAgentClusterName is the name we give the local agent "cluster" in 58 // Envoy config. 59 LocalAgentClusterName = "local_agent" 60 61 // DefaultAuthCheckFrequency is the default value for 62 // Server.AuthCheckFrequency to use when the zero value is provided. 63 DefaultAuthCheckFrequency = 5 * time.Minute 64 ) 65 66 // ACLResolverFunc is a shim to resolve ACLs. Since ACL enforcement is so far 67 // entirely agent-local and all uses private methods this allows a simple shim 68 // to be written in the agent package to allow resolving without tightly 69 // coupling this to the agent. 70 type ACLResolverFunc func(id string) (acl.Authorizer, error) 71 72 // ConnectAuthz is the interface the agent needs to expose to be able to re-use 73 // the authorization logic between both APIs. 74 type ConnectAuthz interface { 75 // ConnectAuthorize is implemented by Agent.ConnectAuthorize 76 ConnectAuthorize(token string, req *structs.ConnectAuthorizeRequest) (authz bool, reason string, m *cache.ResultMeta, err error) 77 } 78 79 // ConfigManager is the interface xds.Server requires to consume proxy config 80 // updates. It's satisfied normally by the agent's proxycfg.Manager, but allows 81 // easier testing without several layers of mocked cache, local state and 82 // proxycfg.Manager. 83 type ConfigManager interface { 84 Watch(proxyID string) (<-chan *proxycfg.ConfigSnapshot, proxycfg.CancelFunc) 85 } 86 87 // Server represents a gRPC server that can handle both XDS and ext_authz 88 // requests from Envoy. All of it's public members must be set before the gRPC 89 // server is started. 90 // 91 // A full description of the XDS protocol can be found at 92 // https://github.com/envoyproxy/data-plane-api/blob/master/XDS_PROTOCOL.md 93 type Server struct { 94 Logger *log.Logger 95 CfgMgr ConfigManager 96 Authz ConnectAuthz 97 ResolveToken ACLResolverFunc 98 // AuthCheckFrequency is how often we should re-check the credentials used 99 // during a long-lived gRPC Stream after it has been initially established. 100 // This is only used during idle periods of stream interactions (i.e. when 101 // there has been no recent DiscoveryRequest). 102 AuthCheckFrequency time.Duration 103 } 104 105 // Initialize will finish configuring the Server for first use. 106 func (s *Server) Initialize() { 107 if s.AuthCheckFrequency == 0 { 108 s.AuthCheckFrequency = DefaultAuthCheckFrequency 109 } 110 } 111 112 // StreamAggregatedResources implements 113 // envoydisco.AggregatedDiscoveryServiceServer. This is the ADS endpoint which is 114 // the only xDS API we directly support for now. 115 func (s *Server) StreamAggregatedResources(stream ADSStream) error { 116 // a channel for receiving incoming requests 117 reqCh := make(chan *envoy.DiscoveryRequest) 118 reqStop := int32(0) 119 go func() { 120 for { 121 req, err := stream.Recv() 122 if atomic.LoadInt32(&reqStop) != 0 { 123 return 124 } 125 if err != nil { 126 close(reqCh) 127 return 128 } 129 reqCh <- req 130 } 131 }() 132 133 err := s.process(stream, reqCh) 134 if err != nil { 135 s.Logger.Printf("[DEBUG] Error handling ADS stream: %s", err) 136 } 137 138 // prevents writing to a closed channel if send failed on blocked recv 139 atomic.StoreInt32(&reqStop, 1) 140 141 return err 142 } 143 144 const ( 145 stateInit int = iota 146 statePendingInitialConfig 147 stateRunning 148 ) 149 150 func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) error { 151 // xDS requires a unique nonce to correlate response/request pairs 152 var nonce uint64 153 154 // xDS works with versions of configs. Internally we don't have a consistent 155 // version. We could just hash the config since versions don't have to be 156 // ordered as far as I can tell, but it's cheaper just to increment a counter 157 // every time we observe a new config since the upstream proxycfg package only 158 // delivers updates when there are actual changes. 159 var configVersion uint64 160 161 // Loop state 162 var cfgSnap *proxycfg.ConfigSnapshot 163 var req *envoy.DiscoveryRequest 164 var ok bool 165 var stateCh <-chan *proxycfg.ConfigSnapshot 166 var watchCancel func() 167 var proxyID string 168 169 // need to run a small state machine to get through initial authentication. 170 var state = stateInit 171 172 // Configure handlers for each type of request 173 handlers := map[string]*xDSType{ 174 EndpointType: &xDSType{ 175 typeURL: EndpointType, 176 resources: endpointsFromSnapshot, 177 stream: stream, 178 }, 179 ClusterType: &xDSType{ 180 typeURL: ClusterType, 181 resources: clustersFromSnapshot, 182 stream: stream, 183 }, 184 RouteType: &xDSType{ 185 typeURL: RouteType, 186 resources: routesFromSnapshot, 187 stream: stream, 188 }, 189 ListenerType: &xDSType{ 190 typeURL: ListenerType, 191 resources: listenersFromSnapshot, 192 stream: stream, 193 }, 194 } 195 196 var authTimer <-chan time.Time 197 extendAuthTimer := func() { 198 authTimer = time.After(s.AuthCheckFrequency) 199 } 200 201 checkStreamACLs := func(cfgSnap *proxycfg.ConfigSnapshot) error { 202 if cfgSnap == nil { 203 return status.Errorf(codes.Unauthenticated, "unauthenticated: no config snapshot") 204 } 205 206 token := tokenFromStream(stream) 207 rule, err := s.ResolveToken(token) 208 209 if acl.IsErrNotFound(err) { 210 return status.Errorf(codes.Unauthenticated, "unauthenticated: %v", err) 211 } else if acl.IsErrPermissionDenied(err) { 212 return status.Errorf(codes.PermissionDenied, "permission denied: %v", err) 213 } else if err != nil { 214 return err 215 } 216 217 if rule != nil && !rule.ServiceWrite(cfgSnap.Proxy.DestinationServiceName, nil) { 218 return status.Errorf(codes.PermissionDenied, "permission denied") 219 } 220 221 // Authed OK! 222 return nil 223 } 224 225 for { 226 select { 227 case <-authTimer: 228 // It's been too long since a Discovery{Request,Response} so recheck ACLs. 229 if err := checkStreamACLs(cfgSnap); err != nil { 230 return err 231 } 232 extendAuthTimer() 233 234 case req, ok = <-reqCh: 235 if !ok { 236 // reqCh is closed when stream.Recv errors which is how we detect client 237 // going away. AFAICT the stream.Context() is only canceled once the 238 // RPC method returns which it can't until we return from this one so 239 // there's no point in blocking on that. 240 return nil 241 } 242 if req.TypeUrl == "" { 243 return status.Errorf(codes.InvalidArgument, "type URL is required for ADS") 244 } 245 if handler, ok := handlers[req.TypeUrl]; ok { 246 handler.Recv(req) 247 } 248 case cfgSnap = <-stateCh: 249 // We got a new config, update the version counter 250 configVersion++ 251 } 252 253 // Trigger state machine 254 switch state { 255 case stateInit: 256 if req == nil { 257 // This can't happen (tm) since stateCh is nil until after the first req 258 // is received but lets not panic about it. 259 continue 260 } 261 // Start authentication process, we need the proxyID 262 proxyID = req.Node.Id 263 264 // Start watching config for that proxy 265 stateCh, watchCancel = s.CfgMgr.Watch(proxyID) 266 // Note that in this case we _intend_ the defer to only be triggered when 267 // this whole process method ends (i.e. when streaming RPC aborts) not at 268 // the end of the current loop iteration. We have to do it in the loop 269 // here since we can't start watching until we get to this state in the 270 // state machine. 271 defer watchCancel() 272 273 // Now wait for the config so we can check ACL 274 state = statePendingInitialConfig 275 case statePendingInitialConfig: 276 if cfgSnap == nil { 277 // Nothing we can do until we get the initial config 278 continue 279 } 280 281 // Got config, try to authenticate next. 282 state = stateRunning 283 284 // Lets actually process the config we just got or we'll mis responding 285 fallthrough 286 case stateRunning: 287 // Check ACLs on every Discovery{Request,Response}. 288 if err := checkStreamACLs(cfgSnap); err != nil { 289 return err 290 } 291 // For the first time through the state machine, this is when the 292 // timer is first started. 293 extendAuthTimer() 294 295 // See if any handlers need to have the current (possibly new) config 296 // sent. Note the order here is actually significant so we can't just 297 // range the map which has no determined order. It's important because: 298 // 299 // 1. Envoy needs to see a consistent snapshot to avoid potentially 300 // dropping traffic due to inconsistencies. This is the 301 // main win of ADS after all - we get to control this order. 302 // 2. Non-determinsic order of complex protobuf responses which are 303 // compared for non-exact JSON equivalence makes the tests uber-messy 304 // to handle 305 for _, typeURL := range []string{ClusterType, EndpointType, RouteType, ListenerType} { 306 handler := handlers[typeURL] 307 if err := handler.SendIfNew(cfgSnap, configVersion, &nonce); err != nil { 308 return err 309 } 310 } 311 } 312 } 313 } 314 315 type xDSType struct { 316 typeURL string 317 stream ADSStream 318 req *envoy.DiscoveryRequest 319 lastNonce string 320 // lastVersion is the version that was last sent to the proxy. It is needed 321 // because we don't want to send the same version more than once. 322 // req.VersionInfo may be an older version than the most recent once sent in 323 // two cases: 1) if the ACK wasn't received yet and `req` still points to the 324 // previous request we already responded to and 2) if the proxy rejected the 325 // last version we sent with a Nack then req.VersionInfo will be the older 326 // version it's hanging on to. 327 lastVersion uint64 328 resources func(cfgSnap *proxycfg.ConfigSnapshot, token string) ([]proto.Message, error) 329 } 330 331 func (t *xDSType) Recv(req *envoy.DiscoveryRequest) { 332 if t.lastNonce == "" || t.lastNonce == req.GetResponseNonce() { 333 t.req = req 334 } 335 } 336 337 func (t *xDSType) SendIfNew(cfgSnap *proxycfg.ConfigSnapshot, version uint64, nonce *uint64) error { 338 if t.req == nil { 339 return nil 340 } 341 if t.lastVersion >= version { 342 // Already sent this version 343 return nil 344 } 345 resources, err := t.resources(cfgSnap, tokenFromStream(t.stream)) 346 if err != nil { 347 return err 348 } 349 if resources == nil || len(resources) == 0 { 350 // Nothing to send yet 351 return nil 352 } 353 354 // Note we only increment nonce when we actually send - not important for 355 // correctness but makes tests much simpler when we skip a type like Routes 356 // with nothing to send. 357 *nonce++ 358 nonceStr := fmt.Sprintf("%08x", *nonce) 359 versionStr := fmt.Sprintf("%08x", version) 360 361 resp, err := createResponse(t.typeURL, versionStr, nonceStr, resources) 362 if err != nil { 363 return err 364 } 365 366 err = t.stream.Send(resp) 367 if err != nil { 368 return err 369 } 370 t.lastVersion = version 371 t.lastNonce = nonceStr 372 return nil 373 } 374 375 func tokenFromStream(stream ADSStream) string { 376 return tokenFromContext(stream.Context()) 377 } 378 379 func tokenFromContext(ctx context.Context) string { 380 md, ok := metadata.FromIncomingContext(ctx) 381 if !ok { 382 return "" 383 } 384 toks, ok := md["x-consul-token"] 385 if ok && len(toks) > 0 { 386 return toks[0] 387 } 388 return "" 389 } 390 391 // IncrementalAggregatedResources implements envoydisco.AggregatedDiscoveryServiceServer 392 func (s *Server) IncrementalAggregatedResources(_ envoydisco.AggregatedDiscoveryService_IncrementalAggregatedResourcesServer) error { 393 return errors.New("not implemented") 394 } 395 396 func deniedResponse(reason string) (*envoyauthz.CheckResponse, error) { 397 return &envoyauthz.CheckResponse{ 398 Status: &rpc.Status{ 399 Code: int32(rpc.PERMISSION_DENIED), 400 Message: "Denied: " + reason, 401 }, 402 }, nil 403 } 404 405 // Check implements envoyauthz.AuthorizationServer. 406 func (s *Server) Check(ctx context.Context, r *envoyauthz.CheckRequest) (*envoyauthz.CheckResponse, error) { 407 // Sanity checks 408 if r.Attributes == nil || r.Attributes.Source == nil || r.Attributes.Destination == nil { 409 return nil, status.Error(codes.InvalidArgument, "source and destination attributes are required") 410 } 411 if r.Attributes.Source.Principal == "" || r.Attributes.Destination.Principal == "" { 412 return nil, status.Error(codes.InvalidArgument, "source and destination Principal are required") 413 } 414 415 // Parse destination to know the target service 416 dest, err := connect.ParseCertURIFromString(r.Attributes.Destination.Principal) 417 if err != nil { 418 // Treat this as an auth error since Envoy has sent something it considers 419 // valid, it's just not an identity we trust. 420 return deniedResponse("Destination Principal is not a valid Connect identity") 421 } 422 423 destID, ok := dest.(*connect.SpiffeIDService) 424 if !ok { 425 return deniedResponse("Destination Principal is not a valid Service identity") 426 } 427 428 // For now we don't validate the trust domain of the _destination_ at all - 429 // the HTTP Authorize endpoint just accepts a target _service_ and it's 430 // implicit that the request is for the correct cluster. We might want to 431 // reconsider this later but plumbing in additional machinery to check the 432 // clusterID here is not really necessary for now unless Envoys are badly 433 // configured. Our threat model _requires_ correctly configured and well 434 // behaved proxies given that they have ACLs to fetch certs and so can do 435 // whatever they want including not authorizing traffic at all or routing it 436 // do a different service than they auth'd against. 437 438 // Create an authz request 439 req := &structs.ConnectAuthorizeRequest{ 440 Target: destID.Service, 441 ClientCertURI: r.Attributes.Source.Principal, 442 // TODO(banks): need Envoy to support sending cert serial/hash to enforce 443 // revocation later. 444 } 445 token := tokenFromContext(ctx) 446 authed, reason, _, err := s.Authz.ConnectAuthorize(token, req) 447 if err != nil { 448 if err == acl.ErrPermissionDenied { 449 return nil, status.Error(codes.PermissionDenied, err.Error()) 450 } 451 return nil, status.Error(codes.Internal, err.Error()) 452 } 453 if !authed { 454 return deniedResponse(reason) 455 } 456 457 return &envoyauthz.CheckResponse{ 458 Status: &rpc.Status{ 459 Code: int32(rpc.OK), 460 Message: "ALLOWED: " + reason, 461 }, 462 }, nil 463 } 464 465 // GRPCServer returns a server instance that can handle XDS and ext_authz 466 // requests. 467 func (s *Server) GRPCServer(certFile, keyFile string) (*grpc.Server, error) { 468 opts := []grpc.ServerOption{ 469 grpc.MaxConcurrentStreams(2048), 470 } 471 if certFile != "" && keyFile != "" { 472 creds, err := credentials.NewServerTLSFromFile(certFile, keyFile) 473 if err != nil { 474 return nil, err 475 } 476 opts = append(opts, grpc.Creds(creds)) 477 } 478 srv := grpc.NewServer(opts...) 479 envoydisco.RegisterAggregatedDiscoveryServiceServer(srv, s) 480 envoyauthz.RegisterAuthorizationServer(srv, s) 481 return srv, nil 482 }