github.com/kjdelisle/consul@v1.4.5/agent/xds/server.go (about)

     1  package xds
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"log"
     8  	"sync/atomic"
     9  	"time"
    10  
    11  	"google.golang.org/grpc"
    12  	"google.golang.org/grpc/codes"
    13  	"google.golang.org/grpc/credentials"
    14  	"google.golang.org/grpc/metadata"
    15  	"google.golang.org/grpc/status"
    16  
    17  	envoy "github.com/envoyproxy/go-control-plane/envoy/api/v2"
    18  	envoyauthz "github.com/envoyproxy/go-control-plane/envoy/service/auth/v2alpha"
    19  	envoydisco "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v2"
    20  	"github.com/gogo/googleapis/google/rpc"
    21  	"github.com/gogo/protobuf/proto"
    22  	"github.com/hashicorp/consul/acl"
    23  	"github.com/hashicorp/consul/agent/cache"
    24  	"github.com/hashicorp/consul/agent/connect"
    25  	"github.com/hashicorp/consul/agent/proxycfg"
    26  	"github.com/hashicorp/consul/agent/structs"
    27  )
    28  
// ADSStream is a shorthand alias for the server-side stream type of the
// Aggregated Discovery Service StreamAggregatedResources RPC, whose generated
// name is too long to use comfortably in signatures.
type ADSStream = envoydisco.AggregatedDiscoveryService_StreamAggregatedResourcesServer
    31  
const (
	// Resource types in xDS v2. These are copied from
	// envoyproxy/go-control-plane/pkg/cache/resource.go since we don't need any of
	// the rest of that package.
	//
	// typePrefix is the common prefix shared by all of the type URLs below.
	typePrefix = "type.googleapis.com/envoy.api.v2."

	// EndpointType is the TypeURL for Endpoint discovery responses.
	EndpointType = typePrefix + "ClusterLoadAssignment"

	// ClusterType is the TypeURL for Cluster discovery responses.
	ClusterType = typePrefix + "Cluster"

	// RouteType is the TypeURL for Route discovery responses.
	RouteType = typePrefix + "RouteConfiguration"

	// ListenerType is the TypeURL for Listener discovery responses.
	ListenerType = typePrefix + "Listener"

	// PublicListenerName is the name we give the public listener in Envoy config.
	PublicListenerName = "public_listener"

	// LocalAppClusterName is the name we give the local application "cluster" in
	// Envoy config.
	LocalAppClusterName = "local_app"

	// LocalAgentClusterName is the name we give the local agent "cluster" in
	// Envoy config.
	LocalAgentClusterName = "local_agent"

	// DefaultAuthCheckFrequency is the default value for
	// Server.AuthCheckFrequency to use when the zero value is provided. It is
	// applied by Server.Initialize.
	DefaultAuthCheckFrequency = 5 * time.Minute
)
    65  
// ACLResolverFunc is a shim to resolve ACLs. ACL enforcement is so far
// entirely agent-local and relies on private methods, so this allows a simple
// shim to be written in the agent package to resolve tokens without tightly
// coupling this package to the agent.
//
// Within this file it is called with the token taken from the stream's
// incoming metadata. A nil Authorizer with a nil error is treated by the
// caller as "no rule to enforce".
type ACLResolverFunc func(id string) (acl.Authorizer, error)
    71  
// ConnectAuthz is the interface the agent needs to expose to be able to re-use
// the authorization logic between both APIs (the HTTP authorize endpoint and
// the ext_authz Check RPC served here).
type ConnectAuthz interface {
	// ConnectAuthorize is implemented by Agent.ConnectAuthorize. It reports
	// whether the request is authorized along with a human-readable reason;
	// Server.Check relays that reason in its response message.
	ConnectAuthorize(token string, req *structs.ConnectAuthorizeRequest) (authz bool, reason string, m *cache.ResultMeta, err error)
}
    78  
// ConfigManager is the interface xds.Server requires to consume proxy config
// updates. It's satisfied normally by the agent's proxycfg.Manager, but allows
// easier testing without several layers of mocked cache, local state and
// proxycfg.Manager.
type ConfigManager interface {
	// Watch returns a channel delivering config snapshots for the proxy with
	// the given ID, plus a function to cancel the watch. The server calls the
	// cancel function when the stream's processing loop exits.
	Watch(proxyID string) (<-chan *proxycfg.ConfigSnapshot, proxycfg.CancelFunc)
}
    86  
// Server represents a gRPC server that can handle both XDS and ext_authz
// requests from Envoy. All of its public members must be set before the gRPC
// server is started.
//
// Logger receives debug output about failed ADS streams. CfgMgr supplies the
// per-proxy config snapshots that are translated into xDS responses. Authz
// backs the ext_authz Check RPC. ResolveToken resolves the ACL token presented
// on a stream into an authorizer.
//
// A full description of the XDS protocol can be found at
// https://github.com/envoyproxy/data-plane-api/blob/master/XDS_PROTOCOL.md
type Server struct {
	Logger       *log.Logger
	CfgMgr       ConfigManager
	Authz        ConnectAuthz
	ResolveToken ACLResolverFunc
	// AuthCheckFrequency is how often we should re-check the credentials used
	// during a long-lived gRPC Stream after it has been initially established.
	// This is only used during idle periods of stream interactions (i.e. when
	// there has been no recent DiscoveryRequest).
	AuthCheckFrequency time.Duration
}
   104  
   105  // Initialize will finish configuring the Server for first use.
   106  func (s *Server) Initialize() {
   107  	if s.AuthCheckFrequency == 0 {
   108  		s.AuthCheckFrequency = DefaultAuthCheckFrequency
   109  	}
   110  }
   111  
   112  // StreamAggregatedResources implements
   113  // envoydisco.AggregatedDiscoveryServiceServer. This is the ADS endpoint which is
   114  // the only xDS API we directly support for now.
   115  func (s *Server) StreamAggregatedResources(stream ADSStream) error {
   116  	// a channel for receiving incoming requests
   117  	reqCh := make(chan *envoy.DiscoveryRequest)
   118  	reqStop := int32(0)
   119  	go func() {
   120  		for {
   121  			req, err := stream.Recv()
   122  			if atomic.LoadInt32(&reqStop) != 0 {
   123  				return
   124  			}
   125  			if err != nil {
   126  				close(reqCh)
   127  				return
   128  			}
   129  			reqCh <- req
   130  		}
   131  	}()
   132  
   133  	err := s.process(stream, reqCh)
   134  	if err != nil {
   135  		s.Logger.Printf("[DEBUG] Error handling ADS stream: %s", err)
   136  	}
   137  
   138  	// prevents writing to a closed channel if send failed on blocked recv
   139  	atomic.StoreInt32(&reqStop, 1)
   140  
   141  	return err
   142  }
   143  
// States of the per-stream state machine run by Server.process.
const (
	// stateInit is the starting state: no request has been seen yet and no
	// config watch has been started.
	stateInit int = iota
	// statePendingInitialConfig means the config watch has been started and we
	// are waiting for the first config snapshot to arrive.
	statePendingInitialConfig
	// stateRunning means a config snapshot is available; ACLs are checked and
	// any new config is sent on every pass through the loop.
	stateRunning
)
   149  
   150  func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) error {
   151  	// xDS requires a unique nonce to correlate response/request pairs
   152  	var nonce uint64
   153  
   154  	// xDS works with versions of configs. Internally we don't have a consistent
   155  	// version. We could just hash the config since versions don't have to be
   156  	// ordered as far as I can tell, but it's cheaper just to increment a counter
   157  	// every time we observe a new config since the upstream proxycfg package only
   158  	// delivers updates when there are actual changes.
   159  	var configVersion uint64
   160  
   161  	// Loop state
   162  	var cfgSnap *proxycfg.ConfigSnapshot
   163  	var req *envoy.DiscoveryRequest
   164  	var ok bool
   165  	var stateCh <-chan *proxycfg.ConfigSnapshot
   166  	var watchCancel func()
   167  	var proxyID string
   168  
   169  	// need to run a small state machine to get through initial authentication.
   170  	var state = stateInit
   171  
   172  	// Configure handlers for each type of request
   173  	handlers := map[string]*xDSType{
   174  		EndpointType: &xDSType{
   175  			typeURL:   EndpointType,
   176  			resources: endpointsFromSnapshot,
   177  			stream:    stream,
   178  		},
   179  		ClusterType: &xDSType{
   180  			typeURL:   ClusterType,
   181  			resources: clustersFromSnapshot,
   182  			stream:    stream,
   183  		},
   184  		RouteType: &xDSType{
   185  			typeURL:   RouteType,
   186  			resources: routesFromSnapshot,
   187  			stream:    stream,
   188  		},
   189  		ListenerType: &xDSType{
   190  			typeURL:   ListenerType,
   191  			resources: listenersFromSnapshot,
   192  			stream:    stream,
   193  		},
   194  	}
   195  
   196  	var authTimer <-chan time.Time
   197  	extendAuthTimer := func() {
   198  		authTimer = time.After(s.AuthCheckFrequency)
   199  	}
   200  
   201  	checkStreamACLs := func(cfgSnap *proxycfg.ConfigSnapshot) error {
   202  		if cfgSnap == nil {
   203  			return status.Errorf(codes.Unauthenticated, "unauthenticated: no config snapshot")
   204  		}
   205  
   206  		token := tokenFromStream(stream)
   207  		rule, err := s.ResolveToken(token)
   208  
   209  		if acl.IsErrNotFound(err) {
   210  			return status.Errorf(codes.Unauthenticated, "unauthenticated: %v", err)
   211  		} else if acl.IsErrPermissionDenied(err) {
   212  			return status.Errorf(codes.PermissionDenied, "permission denied: %v", err)
   213  		} else if err != nil {
   214  			return err
   215  		}
   216  
   217  		if rule != nil && !rule.ServiceWrite(cfgSnap.Proxy.DestinationServiceName, nil) {
   218  			return status.Errorf(codes.PermissionDenied, "permission denied")
   219  		}
   220  
   221  		// Authed OK!
   222  		return nil
   223  	}
   224  
   225  	for {
   226  		select {
   227  		case <-authTimer:
   228  			// It's been too long since a Discovery{Request,Response} so recheck ACLs.
   229  			if err := checkStreamACLs(cfgSnap); err != nil {
   230  				return err
   231  			}
   232  			extendAuthTimer()
   233  
   234  		case req, ok = <-reqCh:
   235  			if !ok {
   236  				// reqCh is closed when stream.Recv errors which is how we detect client
   237  				// going away. AFAICT the stream.Context() is only canceled once the
   238  				// RPC method returns which it can't until we return from this one so
   239  				// there's no point in blocking on that.
   240  				return nil
   241  			}
   242  			if req.TypeUrl == "" {
   243  				return status.Errorf(codes.InvalidArgument, "type URL is required for ADS")
   244  			}
   245  			if handler, ok := handlers[req.TypeUrl]; ok {
   246  				handler.Recv(req)
   247  			}
   248  		case cfgSnap = <-stateCh:
   249  			// We got a new config, update the version counter
   250  			configVersion++
   251  		}
   252  
   253  		// Trigger state machine
   254  		switch state {
   255  		case stateInit:
   256  			if req == nil {
   257  				// This can't happen (tm) since stateCh is nil until after the first req
   258  				// is received but lets not panic about it.
   259  				continue
   260  			}
   261  			// Start authentication process, we need the proxyID
   262  			proxyID = req.Node.Id
   263  
   264  			// Start watching config for that proxy
   265  			stateCh, watchCancel = s.CfgMgr.Watch(proxyID)
   266  			// Note that in this case we _intend_ the defer to only be triggered when
   267  			// this whole process method ends (i.e. when streaming RPC aborts) not at
   268  			// the end of the current loop iteration. We have to do it in the loop
   269  			// here since we can't start watching until we get to this state in the
   270  			// state machine.
   271  			defer watchCancel()
   272  
   273  			// Now wait for the config so we can check ACL
   274  			state = statePendingInitialConfig
   275  		case statePendingInitialConfig:
   276  			if cfgSnap == nil {
   277  				// Nothing we can do until we get the initial config
   278  				continue
   279  			}
   280  
   281  			// Got config, try to authenticate next.
   282  			state = stateRunning
   283  
   284  			// Lets actually process the config we just got or we'll mis responding
   285  			fallthrough
   286  		case stateRunning:
   287  			// Check ACLs on every Discovery{Request,Response}.
   288  			if err := checkStreamACLs(cfgSnap); err != nil {
   289  				return err
   290  			}
   291  			// For the first time through the state machine, this is when the
   292  			// timer is first started.
   293  			extendAuthTimer()
   294  
   295  			// See if any handlers need to have the current (possibly new) config
   296  			// sent. Note the order here is actually significant so we can't just
   297  			// range the map which has no determined order. It's important because:
   298  			//
   299  			//  1. Envoy needs to see a consistent snapshot to avoid potentially
   300  			//     dropping traffic due to inconsistencies. This is the
   301  			//     main win of ADS after all - we get to control this order.
   302  			//  2. Non-determinsic order of complex protobuf responses which are
   303  			//     compared for non-exact JSON equivalence makes the tests uber-messy
   304  			//     to handle
   305  			for _, typeURL := range []string{ClusterType, EndpointType, RouteType, ListenerType} {
   306  				handler := handlers[typeURL]
   307  				if err := handler.SendIfNew(cfgSnap, configVersion, &nonce); err != nil {
   308  					return err
   309  				}
   310  			}
   311  		}
   312  	}
   313  }
   314  
// xDSType holds the per-stream state for a single xDS resource type: the type
// URL it serves, the stream responses are sent on, the most recent request
// accepted by Recv, the nonce of the last response sent, and the function that
// builds this type's resources from a config snapshot and ACL token.
type xDSType struct {
	typeURL   string
	stream    ADSStream
	req       *envoy.DiscoveryRequest
	lastNonce string
	// lastVersion is the version that was last sent to the proxy. It is needed
	// because we don't want to send the same version more than once.
	// req.VersionInfo may be an older version than the most recent one sent in
	// two cases: 1) if the ACK wasn't received yet and `req` still points to the
	// previous request we already responded to and 2) if the proxy rejected the
	// last version we sent with a Nack then req.VersionInfo will be the older
	// version it's hanging on to.
	lastVersion uint64
	resources   func(cfgSnap *proxycfg.ConfigSnapshot, token string) ([]proto.Message, error)
}
   330  
   331  func (t *xDSType) Recv(req *envoy.DiscoveryRequest) {
   332  	if t.lastNonce == "" || t.lastNonce == req.GetResponseNonce() {
   333  		t.req = req
   334  	}
   335  }
   336  
   337  func (t *xDSType) SendIfNew(cfgSnap *proxycfg.ConfigSnapshot, version uint64, nonce *uint64) error {
   338  	if t.req == nil {
   339  		return nil
   340  	}
   341  	if t.lastVersion >= version {
   342  		// Already sent this version
   343  		return nil
   344  	}
   345  	resources, err := t.resources(cfgSnap, tokenFromStream(t.stream))
   346  	if err != nil {
   347  		return err
   348  	}
   349  	if resources == nil || len(resources) == 0 {
   350  		// Nothing to send yet
   351  		return nil
   352  	}
   353  
   354  	// Note we only increment nonce when we actually send - not important for
   355  	// correctness but makes tests much simpler when we skip a type like Routes
   356  	// with nothing to send.
   357  	*nonce++
   358  	nonceStr := fmt.Sprintf("%08x", *nonce)
   359  	versionStr := fmt.Sprintf("%08x", version)
   360  
   361  	resp, err := createResponse(t.typeURL, versionStr, nonceStr, resources)
   362  	if err != nil {
   363  		return err
   364  	}
   365  
   366  	err = t.stream.Send(resp)
   367  	if err != nil {
   368  		return err
   369  	}
   370  	t.lastVersion = version
   371  	t.lastNonce = nonceStr
   372  	return nil
   373  }
   374  
   375  func tokenFromStream(stream ADSStream) string {
   376  	return tokenFromContext(stream.Context())
   377  }
   378  
   379  func tokenFromContext(ctx context.Context) string {
   380  	md, ok := metadata.FromIncomingContext(ctx)
   381  	if !ok {
   382  		return ""
   383  	}
   384  	toks, ok := md["x-consul-token"]
   385  	if ok && len(toks) > 0 {
   386  		return toks[0]
   387  	}
   388  	return ""
   389  }
   390  
   391  // IncrementalAggregatedResources implements envoydisco.AggregatedDiscoveryServiceServer
   392  func (s *Server) IncrementalAggregatedResources(_ envoydisco.AggregatedDiscoveryService_IncrementalAggregatedResourcesServer) error {
   393  	return errors.New("not implemented")
   394  }
   395  
   396  func deniedResponse(reason string) (*envoyauthz.CheckResponse, error) {
   397  	return &envoyauthz.CheckResponse{
   398  		Status: &rpc.Status{
   399  			Code:    int32(rpc.PERMISSION_DENIED),
   400  			Message: "Denied: " + reason,
   401  		},
   402  	}, nil
   403  }
   404  
   405  // Check implements envoyauthz.AuthorizationServer.
   406  func (s *Server) Check(ctx context.Context, r *envoyauthz.CheckRequest) (*envoyauthz.CheckResponse, error) {
   407  	// Sanity checks
   408  	if r.Attributes == nil || r.Attributes.Source == nil || r.Attributes.Destination == nil {
   409  		return nil, status.Error(codes.InvalidArgument, "source and destination attributes are required")
   410  	}
   411  	if r.Attributes.Source.Principal == "" || r.Attributes.Destination.Principal == "" {
   412  		return nil, status.Error(codes.InvalidArgument, "source and destination Principal are required")
   413  	}
   414  
   415  	// Parse destination to know the target service
   416  	dest, err := connect.ParseCertURIFromString(r.Attributes.Destination.Principal)
   417  	if err != nil {
   418  		// Treat this as an auth error since Envoy has sent something it considers
   419  		// valid, it's just not an identity we trust.
   420  		return deniedResponse("Destination Principal is not a valid Connect identity")
   421  	}
   422  
   423  	destID, ok := dest.(*connect.SpiffeIDService)
   424  	if !ok {
   425  		return deniedResponse("Destination Principal is not a valid Service identity")
   426  	}
   427  
   428  	// For now we don't validate the trust domain of the _destination_ at all -
   429  	// the HTTP Authorize endpoint just accepts a target _service_ and it's
   430  	// implicit that the request is for the correct cluster. We might want to
   431  	// reconsider this later but plumbing in additional machinery to check the
   432  	// clusterID here is not really necessary for now unless Envoys are badly
   433  	// configured. Our threat model _requires_ correctly configured and well
   434  	// behaved proxies given that they have ACLs to fetch certs and so can do
   435  	// whatever they want including not authorizing traffic at all or routing it
   436  	// do a different service than they auth'd against.
   437  
   438  	// Create an authz request
   439  	req := &structs.ConnectAuthorizeRequest{
   440  		Target:        destID.Service,
   441  		ClientCertURI: r.Attributes.Source.Principal,
   442  		// TODO(banks): need Envoy to support sending cert serial/hash to enforce
   443  		// revocation later.
   444  	}
   445  	token := tokenFromContext(ctx)
   446  	authed, reason, _, err := s.Authz.ConnectAuthorize(token, req)
   447  	if err != nil {
   448  		if err == acl.ErrPermissionDenied {
   449  			return nil, status.Error(codes.PermissionDenied, err.Error())
   450  		}
   451  		return nil, status.Error(codes.Internal, err.Error())
   452  	}
   453  	if !authed {
   454  		return deniedResponse(reason)
   455  	}
   456  
   457  	return &envoyauthz.CheckResponse{
   458  		Status: &rpc.Status{
   459  			Code:    int32(rpc.OK),
   460  			Message: "ALLOWED: " + reason,
   461  		},
   462  	}, nil
   463  }
   464  
   465  // GRPCServer returns a server instance that can handle XDS and ext_authz
   466  // requests.
   467  func (s *Server) GRPCServer(certFile, keyFile string) (*grpc.Server, error) {
   468  	opts := []grpc.ServerOption{
   469  		grpc.MaxConcurrentStreams(2048),
   470  	}
   471  	if certFile != "" && keyFile != "" {
   472  		creds, err := credentials.NewServerTLSFromFile(certFile, keyFile)
   473  		if err != nil {
   474  			return nil, err
   475  		}
   476  		opts = append(opts, grpc.Creds(creds))
   477  	}
   478  	srv := grpc.NewServer(opts...)
   479  	envoydisco.RegisterAggregatedDiscoveryServiceServer(srv, s)
   480  	envoyauthz.RegisterAuthorizationServer(srv, s)
   481  	return srv, nil
   482  }