github.com/sl1pm4t/consul@v1.4.5-0.20190325224627-74c31c540f9c/agent/xds/server.go (about)

     1  package xds
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"log"
     8  	"sync/atomic"
     9  	"time"
    10  
    11  	"google.golang.org/grpc"
    12  	"google.golang.org/grpc/codes"
    13  	"google.golang.org/grpc/credentials"
    14  	"google.golang.org/grpc/metadata"
    15  	"google.golang.org/grpc/status"
    16  
    17  	envoy "github.com/envoyproxy/go-control-plane/envoy/api/v2"
    18  	envoyauthz "github.com/envoyproxy/go-control-plane/envoy/service/auth/v2alpha"
    19  	envoydisco "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v2"
    20  	"github.com/gogo/googleapis/google/rpc"
    21  	"github.com/gogo/protobuf/proto"
    22  	"github.com/hashicorp/consul/acl"
    23  	"github.com/hashicorp/consul/agent/cache"
    24  	"github.com/hashicorp/consul/agent/connect"
    25  	"github.com/hashicorp/consul/agent/proxycfg"
    26  	"github.com/hashicorp/consul/agent/structs"
    27  )
    28  
// ADSStream is a shorter way of referring to the server-side stream type of
// the Aggregated Discovery Service's StreamAggregatedResources RPC.
type ADSStream = envoydisco.AggregatedDiscoveryService_StreamAggregatedResourcesServer
    31  
const (
	// Resource types in xDS v2. These are copied from
	// envoyproxy/go-control-plane/pkg/cache/resource.go since we don't need any of
	// the rest of that package.

	// typePrefix is the common prefix shared by every xDS v2 type URL below.
	typePrefix = "type.googleapis.com/envoy.api.v2."

	// EndpointType is the TypeURL for Endpoint discovery responses.
	EndpointType = typePrefix + "ClusterLoadAssignment"

	// ClusterType is the TypeURL for Cluster discovery responses.
	ClusterType = typePrefix + "Cluster"

	// RouteType is the TypeURL for Route discovery responses.
	RouteType = typePrefix + "RouteConfiguration"

	// ListenerType is the TypeURL for Listener discovery responses.
	ListenerType = typePrefix + "Listener"

	// PublicListenerName is the name we give the public listener in Envoy config.
	PublicListenerName = "public_listener"

	// LocalAppClusterName is the name we give the local application "cluster" in
	// Envoy config.
	LocalAppClusterName = "local_app"

	// LocalAgentClusterName is the name we give the local agent "cluster" in
	// Envoy config.
	LocalAgentClusterName = "local_agent"

	// DefaultAuthCheckFrequency is the default value for
	// Server.AuthCheckFrequency to use when the zero value is provided.
	DefaultAuthCheckFrequency = 5 * time.Minute
)
    65  
// ACLResolverFunc is a shim to resolve ACLs. ACL enforcement is so far
// entirely agent-local and only uses private methods, so this allows a simple
// shim to be written in the agent package that resolves a token to an
// acl.Authorizer without tightly coupling this package to the agent.
type ACLResolverFunc func(id string) (acl.Authorizer, error)
    71  
// ConnectAuthz is the interface the agent needs to expose to be able to re-use
// the authorization logic between both APIs.
type ConnectAuthz interface {
	// ConnectAuthorize is implemented by Agent.ConnectAuthorize. It reports
	// whether the request is authorized (authz) together with a human-readable
	// reason; Server.Check forwards that reason in its response message.
	ConnectAuthorize(token string, req *structs.ConnectAuthorizeRequest) (authz bool, reason string, m *cache.ResultMeta, err error)
}
    78  
// ConfigManager is the interface xds.Server requires to consume proxy config
// updates. It's satisfied normally by the agent's proxycfg.Manager, but allows
// easier testing without several layers of mocked cache, local state and
// proxycfg.Manager.
type ConfigManager interface {
	// Watch returns a channel of config snapshots for the given proxy ID and a
	// function the caller invokes to cancel the watch.
	Watch(proxyID string) (<-chan *proxycfg.ConfigSnapshot, proxycfg.CancelFunc)
}
    86  
// Server represents a gRPC server that can handle both XDS and ext_authz
// requests from Envoy. All of its public members must be set before the gRPC
// server is started.
//
// A full description of the XDS protocol can be found at
// https://github.com/envoyproxy/data-plane-api/blob/master/XDS_PROTOCOL.md
type Server struct {
	// Logger receives debug output about failed ADS streams.
	Logger *log.Logger
	// CfgMgr supplies the per-proxy config snapshots that are streamed to Envoy.
	CfgMgr ConfigManager
	// Authz performs the Connect authorization used by the ext_authz Check RPC.
	Authz ConnectAuthz
	// ResolveToken turns the token presented on a stream into an ACL authorizer.
	ResolveToken ACLResolverFunc
	// AuthCheckFrequency is how often we should re-check the credentials used
	// during a long-lived gRPC Stream after it has been initially established.
	// This is only used during idle periods of stream interactions (i.e. when
	// there has been no recent DiscoveryRequest).
	AuthCheckFrequency time.Duration
}
   104  
   105  // Initialize will finish configuring the Server for first use.
   106  func (s *Server) Initialize() {
   107  	if s.AuthCheckFrequency == 0 {
   108  		s.AuthCheckFrequency = DefaultAuthCheckFrequency
   109  	}
   110  }
   111  
   112  // StreamAggregatedResources implements
   113  // envoydisco.AggregatedDiscoveryServiceServer. This is the ADS endpoint which is
   114  // the only xDS API we directly support for now.
   115  func (s *Server) StreamAggregatedResources(stream ADSStream) error {
   116  	// a channel for receiving incoming requests
   117  	reqCh := make(chan *envoy.DiscoveryRequest)
   118  	reqStop := int32(0)
   119  	go func() {
   120  		for {
   121  			req, err := stream.Recv()
   122  			if atomic.LoadInt32(&reqStop) != 0 {
   123  				return
   124  			}
   125  			if err != nil {
   126  				close(reqCh)
   127  				return
   128  			}
   129  			reqCh <- req
   130  		}
   131  	}()
   132  
   133  	err := s.process(stream, reqCh)
   134  	if err != nil {
   135  		s.Logger.Printf("[DEBUG] Error handling ADS stream: %s", err)
   136  	}
   137  
   138  	// prevents writing to a closed channel if send failed on blocked recv
   139  	atomic.StoreInt32(&reqStop, 1)
   140  
   141  	return err
   142  }
   143  
// States of the small state machine run by Server.process.
const (
	// stateInit: no config watch has been started yet; process is waiting for
	// the first request so it can learn the proxy ID.
	stateInit int = iota
	// statePendingInitialConfig: the config watch has been started and process
	// is waiting for the first config snapshot to arrive.
	statePendingInitialConfig
	// stateRunning: a config snapshot is available; ACLs are checked and any
	// new config is sent on every loop iteration.
	stateRunning
)
   149  
   150  func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) error {
   151  	// xDS requires a unique nonce to correlate response/request pairs
   152  	var nonce uint64
   153  
   154  	// xDS works with versions of configs. Internally we don't have a consistent
   155  	// version. We could just hash the config since versions don't have to be
   156  	// ordered as far as I can tell, but it's cheaper just to increment a counter
   157  	// every time we observe a new config since the upstream proxycfg package only
   158  	// delivers updates when there are actual changes.
   159  	var configVersion uint64
   160  
   161  	// Loop state
   162  	var cfgSnap *proxycfg.ConfigSnapshot
   163  	var req *envoy.DiscoveryRequest
   164  	var ok bool
   165  	var stateCh <-chan *proxycfg.ConfigSnapshot
   166  	var watchCancel func()
   167  	var proxyID string
   168  
   169  	// need to run a small state machine to get through initial authentication.
   170  	var state = stateInit
   171  
   172  	// Configure handlers for each type of request
   173  	handlers := map[string]*xDSType{
   174  		EndpointType: &xDSType{
   175  			typeURL:   EndpointType,
   176  			resources: endpointsFromSnapshot,
   177  			stream:    stream,
   178  		},
   179  		ClusterType: &xDSType{
   180  			typeURL:   ClusterType,
   181  			resources: clustersFromSnapshot,
   182  			stream:    stream,
   183  		},
   184  		RouteType: &xDSType{
   185  			typeURL:   RouteType,
   186  			resources: routesFromSnapshot,
   187  			stream:    stream,
   188  		},
   189  		ListenerType: &xDSType{
   190  			typeURL:   ListenerType,
   191  			resources: listenersFromSnapshot,
   192  			stream:    stream,
   193  		},
   194  	}
   195  
   196  	var authTimer <-chan time.Time
   197  	extendAuthTimer := func() {
   198  		authTimer = time.After(s.AuthCheckFrequency)
   199  	}
   200  
   201  	checkStreamACLs := func(cfgSnap *proxycfg.ConfigSnapshot) error {
   202  		if cfgSnap == nil {
   203  			return status.Errorf(codes.Unauthenticated, "unauthenticated: no config snapshot")
   204  		}
   205  
   206  		token := tokenFromStream(stream)
   207  		rule, err := s.ResolveToken(token)
   208  
   209  		if acl.IsErrNotFound(err) {
   210  			return status.Errorf(codes.Unauthenticated, "unauthenticated: %v", err)
   211  		} else if acl.IsErrPermissionDenied(err) {
   212  			return status.Errorf(codes.PermissionDenied, "permission denied: %v", err)
   213  		} else if err != nil {
   214  			return err
   215  		}
   216  
   217  		if rule != nil && !rule.ServiceWrite(cfgSnap.Proxy.DestinationServiceName, nil) {
   218  			return status.Errorf(codes.PermissionDenied, "permission denied")
   219  		}
   220  
   221  		// Authed OK!
   222  		return nil
   223  	}
   224  
   225  	for {
   226  		select {
   227  		case <-authTimer:
   228  			// It's been too long since a Discovery{Request,Response} so recheck ACLs.
   229  			if err := checkStreamACLs(cfgSnap); err != nil {
   230  				return err
   231  			}
   232  			extendAuthTimer()
   233  
   234  		case req, ok = <-reqCh:
   235  			if !ok {
   236  				// reqCh is closed when stream.Recv errors which is how we detect client
   237  				// going away. AFAICT the stream.Context() is only canceled once the
   238  				// RPC method returns which it can't until we return from this one so
   239  				// there's no point in blocking on that.
   240  				return nil
   241  			}
   242  			if req.TypeUrl == "" {
   243  				return status.Errorf(codes.InvalidArgument, "type URL is required for ADS")
   244  			}
   245  			if handler, ok := handlers[req.TypeUrl]; ok {
   246  				handler.Recv(req)
   247  			}
   248  		case cfgSnap = <-stateCh:
   249  			// We got a new config, update the version counter
   250  			configVersion++
   251  		}
   252  
   253  		// Trigger state machine
   254  		switch state {
   255  		case stateInit:
   256  			if req == nil {
   257  				// This can't happen (tm) since stateCh is nil until after the first req
   258  				// is received but lets not panic about it.
   259  				continue
   260  			}
   261  			// Start authentication process, we need the proxyID
   262  			proxyID = req.Node.Id
   263  
   264  			// Start watching config for that proxy
   265  			stateCh, watchCancel = s.CfgMgr.Watch(proxyID)
   266  			// Note that in this case we _intend_ the defer to only be triggered when
   267  			// this whole process method ends (i.e. when streaming RPC aborts) not at
   268  			// the end of the current loop iteration. We have to do it in the loop
   269  			// here since we can't start watching until we get to this state in the
   270  			// state machine.
   271  			defer watchCancel()
   272  
   273  			// Now wait for the config so we can check ACL
   274  			state = statePendingInitialConfig
   275  		case statePendingInitialConfig:
   276  			if cfgSnap == nil {
   277  				// Nothing we can do until we get the initial config
   278  				continue
   279  			}
   280  
   281  			// Got config, try to authenticate next.
   282  			state = stateRunning
   283  
   284  			// Lets actually process the config we just got or we'll mis responding
   285  			fallthrough
   286  		case stateRunning:
   287  			// Check ACLs on every Discovery{Request,Response}.
   288  			if err := checkStreamACLs(cfgSnap); err != nil {
   289  				return err
   290  			}
   291  			// For the first time through the state machine, this is when the
   292  			// timer is first started.
   293  			extendAuthTimer()
   294  
   295  			// See if any handlers need to have the current (possibly new) config
   296  			// sent. Note the order here is actually significant so we can't just
   297  			// range the map which has no determined order. It's important because:
   298  			//
   299  			//  1. Envoy needs to see a consistent snapshot to avoid potentially
   300  			//     dropping traffic due to inconsistencies. This is the
   301  			//     main win of ADS after all - we get to control this order.
   302  			//  2. Non-determinsic order of complex protobuf responses which are
   303  			//     compared for non-exact JSON equivalence makes the tests uber-messy
   304  			//     to handle
   305  			for _, typeURL := range []string{ClusterType, EndpointType, RouteType, ListenerType} {
   306  				handler := handlers[typeURL]
   307  				if err := handler.SendIfNew(cfgSnap, configVersion, &nonce); err != nil {
   308  					return err
   309  				}
   310  			}
   311  		}
   312  	}
   313  }
   314  
// xDSType tracks the request/response state for one xDS resource type on a
// single ADS stream.
type xDSType struct {
	// typeURL is the type URL placed in responses for this resource type.
	typeURL string
	// stream is the ADS stream responses are sent on; it is also where the ACL
	// token passed to resources is read from.
	stream ADSStream
	// req is the most recent request accepted by Recv, or nil if the proxy has
	// not yet asked for this type.
	req *envoy.DiscoveryRequest
	// lastNonce is the nonce of the last response sent, or "" if none was sent.
	lastNonce string
	// lastVersion is the version that was last sent to the proxy. It is needed
	// because we don't want to send the same version more than once.
	// req.VersionInfo may be an older version than the most recent one sent in
	// two cases: 1) if the ACK wasn't received yet and `req` still points to the
	// previous request we already responded to and 2) if the proxy rejected the
	// last version we sent with a Nack then req.VersionInfo will be the older
	// version it's hanging on to.
	lastVersion uint64
	// resources builds the resources of this type from a config snapshot.
	resources func(cfgSnap *proxycfg.ConfigSnapshot, token string) ([]proto.Message, error)
}
   330  
   331  func (t *xDSType) Recv(req *envoy.DiscoveryRequest) {
   332  	if t.lastNonce == "" || t.lastNonce == req.GetResponseNonce() {
   333  		t.req = req
   334  	}
   335  }
   336  
   337  func (t *xDSType) SendIfNew(cfgSnap *proxycfg.ConfigSnapshot, version uint64, nonce *uint64) error {
   338  	if t.req == nil {
   339  		return nil
   340  	}
   341  	if t.lastVersion >= version {
   342  		// Already sent this version
   343  		return nil
   344  	}
   345  	resources, err := t.resources(cfgSnap, tokenFromStream(t.stream))
   346  	if err != nil {
   347  		return err
   348  	}
   349  	// Zero length resource responses should be ignored and are the result of no
   350  	// data yet. Notice that this caused a bug originally where we had zero
   351  	// healthy endpoints for an upstream that would cause Envoy to hang waiting
   352  	// for the EDS response. This is fixed though by ensuring we send an explicit
   353  	// empty LoadAssignment resource for the cluster rather than allowing junky
   354  	// empty resources.
   355  	if len(resources) == 0 {
   356  		// Nothing to send yet
   357  		return nil
   358  	}
   359  
   360  	// Note we only increment nonce when we actually send - not important for
   361  	// correctness but makes tests much simpler when we skip a type like Routes
   362  	// with nothing to send.
   363  	*nonce++
   364  	nonceStr := fmt.Sprintf("%08x", *nonce)
   365  	versionStr := fmt.Sprintf("%08x", version)
   366  
   367  	resp, err := createResponse(t.typeURL, versionStr, nonceStr, resources)
   368  	if err != nil {
   369  		return err
   370  	}
   371  
   372  	err = t.stream.Send(resp)
   373  	if err != nil {
   374  		return err
   375  	}
   376  	t.lastVersion = version
   377  	t.lastNonce = nonceStr
   378  	return nil
   379  }
   380  
   381  func tokenFromStream(stream ADSStream) string {
   382  	return tokenFromContext(stream.Context())
   383  }
   384  
   385  func tokenFromContext(ctx context.Context) string {
   386  	md, ok := metadata.FromIncomingContext(ctx)
   387  	if !ok {
   388  		return ""
   389  	}
   390  	toks, ok := md["x-consul-token"]
   391  	if ok && len(toks) > 0 {
   392  		return toks[0]
   393  	}
   394  	return ""
   395  }
   396  
   397  // IncrementalAggregatedResources implements envoydisco.AggregatedDiscoveryServiceServer
   398  func (s *Server) IncrementalAggregatedResources(_ envoydisco.AggregatedDiscoveryService_IncrementalAggregatedResourcesServer) error {
   399  	return errors.New("not implemented")
   400  }
   401  
   402  func deniedResponse(reason string) (*envoyauthz.CheckResponse, error) {
   403  	return &envoyauthz.CheckResponse{
   404  		Status: &rpc.Status{
   405  			Code:    int32(rpc.PERMISSION_DENIED),
   406  			Message: "Denied: " + reason,
   407  		},
   408  	}, nil
   409  }
   410  
   411  // Check implements envoyauthz.AuthorizationServer.
   412  func (s *Server) Check(ctx context.Context, r *envoyauthz.CheckRequest) (*envoyauthz.CheckResponse, error) {
   413  	// Sanity checks
   414  	if r.Attributes == nil || r.Attributes.Source == nil || r.Attributes.Destination == nil {
   415  		return nil, status.Error(codes.InvalidArgument, "source and destination attributes are required")
   416  	}
   417  	if r.Attributes.Source.Principal == "" || r.Attributes.Destination.Principal == "" {
   418  		return nil, status.Error(codes.InvalidArgument, "source and destination Principal are required")
   419  	}
   420  
   421  	// Parse destination to know the target service
   422  	dest, err := connect.ParseCertURIFromString(r.Attributes.Destination.Principal)
   423  	if err != nil {
   424  		// Treat this as an auth error since Envoy has sent something it considers
   425  		// valid, it's just not an identity we trust.
   426  		return deniedResponse("Destination Principal is not a valid Connect identity")
   427  	}
   428  
   429  	destID, ok := dest.(*connect.SpiffeIDService)
   430  	if !ok {
   431  		return deniedResponse("Destination Principal is not a valid Service identity")
   432  	}
   433  
   434  	// For now we don't validate the trust domain of the _destination_ at all -
   435  	// the HTTP Authorize endpoint just accepts a target _service_ and it's
   436  	// implicit that the request is for the correct cluster. We might want to
   437  	// reconsider this later but plumbing in additional machinery to check the
   438  	// clusterID here is not really necessary for now unless Envoys are badly
   439  	// configured. Our threat model _requires_ correctly configured and well
   440  	// behaved proxies given that they have ACLs to fetch certs and so can do
   441  	// whatever they want including not authorizing traffic at all or routing it
   442  	// do a different service than they auth'd against.
   443  
   444  	// Create an authz request
   445  	req := &structs.ConnectAuthorizeRequest{
   446  		Target:        destID.Service,
   447  		ClientCertURI: r.Attributes.Source.Principal,
   448  		// TODO(banks): need Envoy to support sending cert serial/hash to enforce
   449  		// revocation later.
   450  	}
   451  	token := tokenFromContext(ctx)
   452  	authed, reason, _, err := s.Authz.ConnectAuthorize(token, req)
   453  	if err != nil {
   454  		if err == acl.ErrPermissionDenied {
   455  			return nil, status.Error(codes.PermissionDenied, err.Error())
   456  		}
   457  		return nil, status.Error(codes.Internal, err.Error())
   458  	}
   459  	if !authed {
   460  		return deniedResponse(reason)
   461  	}
   462  
   463  	return &envoyauthz.CheckResponse{
   464  		Status: &rpc.Status{
   465  			Code:    int32(rpc.OK),
   466  			Message: "ALLOWED: " + reason,
   467  		},
   468  	}, nil
   469  }
   470  
   471  // GRPCServer returns a server instance that can handle XDS and ext_authz
   472  // requests.
   473  func (s *Server) GRPCServer(certFile, keyFile string) (*grpc.Server, error) {
   474  	opts := []grpc.ServerOption{
   475  		grpc.MaxConcurrentStreams(2048),
   476  	}
   477  	if certFile != "" && keyFile != "" {
   478  		creds, err := credentials.NewServerTLSFromFile(certFile, keyFile)
   479  		if err != nil {
   480  			return nil, err
   481  		}
   482  		opts = append(opts, grpc.Creds(creds))
   483  	}
   484  	srv := grpc.NewServer(opts...)
   485  	envoydisco.RegisterAggregatedDiscoveryServiceServer(srv, s)
   486  	envoyauthz.RegisterAuthorizationServer(srv, s)
   487  	return srv, nil
   488  }