github.com/telepresenceio/telepresence/v2@v2.20.0-pro.6.0.20240517030216-236ea954e789/pkg/client/userd/daemon/service.go (about)

     1  package daemon
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net"
     8  	"os"
     9  	"path/filepath"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	"github.com/spf13/cobra"
    16  	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
    17  	"google.golang.org/grpc"
    18  
    19  	"github.com/datawire/dlib/dgroup"
    20  	"github.com/datawire/dlib/dhttp"
    21  	"github.com/datawire/dlib/dlog"
    22  	"github.com/telepresenceio/telepresence/rpc/v2/common"
    23  	rpc "github.com/telepresenceio/telepresence/rpc/v2/connector"
    24  	"github.com/telepresenceio/telepresence/rpc/v2/manager"
    25  	"github.com/telepresenceio/telepresence/v2/pkg/client"
    26  	"github.com/telepresenceio/telepresence/v2/pkg/client/cli/daemon"
    27  	"github.com/telepresenceio/telepresence/v2/pkg/client/logging"
    28  	"github.com/telepresenceio/telepresence/v2/pkg/client/remotefs"
    29  	"github.com/telepresenceio/telepresence/v2/pkg/client/scout"
    30  	"github.com/telepresenceio/telepresence/v2/pkg/client/socket"
    31  	"github.com/telepresenceio/telepresence/v2/pkg/client/userd"
    32  	"github.com/telepresenceio/telepresence/v2/pkg/client/userd/trafficmgr"
    33  	"github.com/telepresenceio/telepresence/v2/pkg/errcat"
    34  	"github.com/telepresenceio/telepresence/v2/pkg/filelocation"
    35  	"github.com/telepresenceio/telepresence/v2/pkg/log"
    36  	"github.com/telepresenceio/telepresence/v2/pkg/pprof"
    37  	"github.com/telepresenceio/telepresence/v2/pkg/proc"
    38  	"github.com/telepresenceio/telepresence/v2/pkg/tracing"
    39  )
    40  
    41  const titleName = "Connector"
    42  
    43  func help() string {
    44  	return `The Telepresence ` + titleName + ` is a background component that manages a connection.
    45  
    46  Launch the Telepresence ` + titleName + `:
    47      telepresence connect
    48  
    49  Examine the ` + titleName + `'s log output in
    50      ` + filepath.Join(filelocation.AppUserLogDir(context.Background()), userd.ProcessName+".log") + `
    51  to troubleshoot problems.
    52  `
    53  }
    54  
// service represents the long-running state of the Telepresence User Daemon.
type service struct {
	rpc.UnsafeConnectorServer
	srv           *grpc.Server   // gRPC server; nil when created for the podd daemon (see NewService)
	managerProxy  *mgrProxy      // proxies manager RPCs to the currently set traffic-manager client
	timedLogLevel log.TimedLevel // user-daemon log level, possibly raised for a limited time
	ucn           int64
	fuseFTPError  error

	// The quit function that quits the server.
	quit func()

	// quitDisable will temporarily disable the quit function. This is used when there's a desire
	// to cancel the session without cancelling the process although the simplified session management
	// is in effect (rootSessionInProc == true).
	quitDisable bool

	session         userd.Session
	sessionCancel   context.CancelFunc
	sessionContext  context.Context
	sessionQuitting int32 // atomic boolean. True if non-zero.
	sessionLock     sync.RWMutex // guards session, sessionCancel, and sessionContext

	// These are used to communicate between the various goroutines.
	connectRequest  chan *rpc.ConnectRequest // server-grpc.connect() -> connectWorker
	connectResponse chan *rpc.ConnectInfo    // connectWorker -> server-grpc.connect()

	fuseFtpMgr remotefs.FuseFTPManager

	// Run root session in-process
	rootSessionInProc bool

	// The TCP address that the daemon listens to. Will be nil if the daemon listens to a unix socket.
	daemonAddress *net.TCPAddr

	// Possibly extended version of the service. Use when calling interface methods.
	self userd.Service
}
    93  
    94  func NewService(ctx context.Context, _ *dgroup.Group, cfg client.Config, srv *grpc.Server) (userd.Service, error) {
    95  	s := &service{
    96  		srv:             srv,
    97  		connectRequest:  make(chan *rpc.ConnectRequest),
    98  		connectResponse: make(chan *rpc.ConnectInfo),
    99  		managerProxy:    &mgrProxy{},
   100  		timedLogLevel:   log.NewTimedLevel(cfg.LogLevels().UserDaemon.String(), log.SetLevel),
   101  		fuseFtpMgr:      remotefs.NewFuseFTPManager(),
   102  	}
   103  	s.self = s
   104  	if srv != nil {
   105  		// The podd daemon never registers the gRPC servers
   106  		rpc.RegisterConnectorServer(srv, s)
   107  		rpc.RegisterManagerProxyServer(srv, s.managerProxy)
   108  		tracer, err := tracing.NewTraceServer(ctx, "user-daemon")
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  		common.RegisterTracingServer(srv, tracer)
   113  	} else {
   114  		s.rootSessionInProc = true
   115  		s.quit = func() {}
   116  	}
   117  	return s, nil
   118  }
   119  
   120  func (s *service) As(ptr any) {
   121  	switch ptr := ptr.(type) {
   122  	case **service:
   123  		*ptr = s
   124  	case *rpc.ConnectorServer:
   125  		*ptr = s
   126  	default:
   127  		panic(fmt.Sprintf("%T does not implement %T", s, ptr))
   128  	}
   129  }
   130  
   131  func (s *service) ListenerAddress(ctx context.Context) string {
   132  	if s.daemonAddress != nil {
   133  		return s.daemonAddress.String()
   134  	}
   135  	return "unix:" + socket.UserDaemonPath(ctx)
   136  }
   137  
// SetSelf assigns the (possibly extended) service implementation that this
// service uses when invoking its own interface methods.
func (s *service) SetSelf(self userd.Service) {
	s.self = self
}
   141  
// FuseFTPMgr returns the manager responsible for the FuseFTP remote mounts.
func (s *service) FuseFTPMgr() remotefs.FuseFTPManager {
	return s.fuseFtpMgr
}
   145  
// RootSessionInProcess reports whether the root session runs in this process
// (the simplified, single-session management mode).
func (s *service) RootSessionInProcess() bool {
	return s.rootSessionInProc
}
   149  
// Server returns the gRPC server, which is nil when running as the podd daemon.
func (s *service) Server() *grpc.Server {
	return s.srv
}
   153  
// SetManagerClient hands the given traffic-manager client and call options to
// the manager proxy so that proxied RPCs use them from now on.
func (s *service) SetManagerClient(managerClient manager.ManagerClient, callOptions ...grpc.CallOption) {
	s.managerProxy.setClient(managerClient, callOptions...)
}
   157  
// Names of the command-line flags accepted by the connector-foreground command.
const (
	nameFlag         = "name"
	addressFlag      = "address"
	embedNetworkFlag = "embed-network"
	pprofFlag        = "pprof"
)
   164  
   165  // Command returns the CLI sub-command for "connector-foreground".
   166  func Command() *cobra.Command {
   167  	c := &cobra.Command{
   168  		Use:    userd.ProcessName + "-foreground",
   169  		Short:  "Launch Telepresence " + titleName + " in the foreground (debug)",
   170  		Args:   cobra.ExactArgs(0),
   171  		Hidden: true,
   172  		Long:   help(),
   173  		RunE:   run,
   174  	}
   175  	flags := c.Flags()
   176  	flags.String(nameFlag, userd.ProcessName, "Daemon name")
   177  	flags.String(addressFlag, "", "Address to listen to. Defaults to "+socket.UserDaemonPath(context.Background()))
   178  	flags.Bool(embedNetworkFlag, false, "Embed network functionality in the user daemon. Requires capability NET_ADMIN")
   179  	flags.Uint16(pprofFlag, 0, "start pprof server on the given port")
   180  	return c
   181  }
   182  
// configReload watches the client configuration file and applies changes as
// they occur: to the active session when one exists, otherwise by restoring
// the in-memory defaults from the file.
func (s *service) configReload(c context.Context) error {
	// Ensure that the directory to watch exists.
	if err := os.MkdirAll(filepath.Dir(client.GetConfigFile(c)), 0o755); err != nil {
		return err
	}
	// NOTE(review): the callback uses the outer context c, not its own ctx
	// parameter — confirm this is intentional (c carries the loaded config).
	return client.Watch(c, func(ctx context.Context) error {
		s.sessionLock.RLock()
		defer s.sessionLock.RUnlock()
		if s.session == nil {
			return client.RestoreDefaults(c, false)
		}
		return s.session.ApplyConfig(c)
	})
}
   197  
// ManageSessions is the counterpart to the Connect method. It reads the connectCh, creates
// a session and writes a reply to the connectErrCh. The session is then started if it was
// successfully created.
func (s *service) ManageSessions(c context.Context) error {
	wg := sync.WaitGroup{}
	// Wait for the session goroutine started by startSession before returning.
	defer wg.Wait()

	for {
		// Wait for a connection request
		select {
		case <-c.Done():
			return nil
		case cr := <-s.connectRequest:
			rsp := s.startSession(c, cr, &wg)
			// Non-blocking send: the Connect RPC should be waiting on the
			// unbuffered connectResponse channel.
			select {
			case s.connectResponse <- rsp:
			default:
				// Nobody left to read the response? That's fine really. Just means that
				// whoever wanted to start the session terminated early.
				s.cancelSession()
			}
		}
	}
}
   222  
// startSession creates a new daemon session for the given connect request, or,
// when a session already exists, updates its status. The session itself is run
// in a goroutine registered with the given WaitGroup; the returned ConnectInfo
// carries an error code when session creation fails.
func (s *service) startSession(ctx context.Context, cr *rpc.ConnectRequest, wg *sync.WaitGroup) *rpc.ConnectInfo {
	s.sessionLock.Lock() // Locked during creation
	defer s.sessionLock.Unlock()

	if s.session != nil {
		// UpdateStatus sets rpc.ConnectInfo_ALREADY_CONNECTED if successful
		return s.session.UpdateStatus(s.sessionContext, cr)
	}

	// Obtain the kubeconfig from the request parameters so that we can determine
	// what kubernetes context that will be used.
	config, err := client.DaemonKubeconfig(ctx, cr)
	if err != nil {
		if s.rootSessionInProc {
			// Simplified session management: a failed connect ends the daemon.
			s.quit()
		}
		dlog.Errorf(ctx, "Failed to obtain kubeconfig: %v", err)
		return &rpc.ConnectInfo{
			Error:         rpc.ConnectInfo_CLUSTER_FAILED,
			ErrorText:     err.Error(),
			ErrorCategory: int32(errcat.GetCategory(err)),
		}
	}

	ctx, cancel := context.WithCancel(ctx)
	ctx = userd.WithService(ctx, s.self)

	daemonID, err := daemon.NewIdentifier(cr.Name, config.Context, config.Namespace, proc.RunningInContainer())
	if err != nil {
		cancel()
		return &rpc.ConnectInfo{
			Error:         rpc.ConnectInfo_CLUSTER_FAILED,
			ErrorText:     err.Error(),
			ErrorCategory: int32(errcat.GetCategory(err)),
		}
	}
	// Keep the daemon info file fresh, and cancel the session if it is removed.
	go runAliveAndCancellation(ctx, cancel, daemonID)

	ctx, session, rsp := userd.GetNewSessionFunc(ctx)(ctx, cr, config)
	if ctx.Err() != nil || rsp.Error != rpc.ConnectInfo_UNSPECIFIED {
		cancel()
		if s.rootSessionInProc {
			// Simplified session management. The daemon handles one session, then exits.
			s.quit()
		}
		return rsp
	}
	s.session = session
	s.sessionContext = userd.WithSession(ctx, session)
	s.sessionCancel = func() {
		cancel()
		<-session.Done() // wait until the session has fully terminated
	}

	// Run the session asynchronously. We must be able to respond to connect (with UpdateStatus) while
	// the session is running. The s.sessionCancel is called from Disconnect
	wg.Add(1)
	go func(cr *rpc.ConnectRequest) {
		defer func() {
			// Reset session state under write-lock so a subsequent connect
			// can create a fresh session.
			s.sessionLock.Lock()
			s.self.SetManagerClient(nil)
			s.session = nil
			s.sessionCancel = nil
			if err := client.RestoreDefaults(ctx, false); err != nil {
				dlog.Warn(ctx, err)
			}
			s.sessionLock.Unlock()
			wg.Done()
		}()
		if err := session.RunSession(s.sessionContext); err != nil {
			if errors.Is(err, trafficmgr.ErrSessionExpired) {
				// Session has expired. We need to cancel the owner session and reconnect
				dlog.Info(ctx, "refreshing session")
				s.cancelSession()
				select {
				case <-ctx.Done():
				case s.connectRequest <- cr:
				}
				return
			}

			dlog.Error(ctx, err)
		}
		if s.rootSessionInProc {
			// Simplified session management. The daemon handles one session, then exits.
			s.quit()
		}
	}(cr)
	return rsp
}
   313  
// runAliveAndCancellation keeps the daemon info file for the given identifier
// up to date — signalling to clients that this daemon is alive — and cancels
// the session's context when that info file is removed. It blocks until both
// goroutines end.
func runAliveAndCancellation(ctx context.Context, cancel context.CancelFunc, daemonID *daemon.Identifier) {
	daemonInfoFile := daemonID.InfoFileName()
	g := dgroup.NewGroup(ctx, dgroup.GroupConfig{})
	g.Go(fmt.Sprintf("info-kicker-%s", daemonID), func(ctx context.Context) error {
		// Ensure that the daemon info file is kept recent. This tells clients that we're alive.
		return daemon.KeepInfoAlive(ctx, daemonInfoFile)
	})
	g.Go(fmt.Sprintf("info-watcher-%s", daemonID), func(ctx context.Context) error {
		// Cancel the session if the daemon info file is removed.
		return daemon.WatchInfos(ctx, func(ctx context.Context) error {
			ok, err := daemon.InfoExists(ctx, daemonInfoFile)
			if err == nil && !ok {
				dlog.Debugf(ctx, "info-watcher cancels everything because daemon info %s does not exist", daemonInfoFile)
				cancel()
			}
			return err
		}, daemonInfoFile)
	})
	if err := g.Wait(); err != nil {
		dlog.Error(ctx, err)
	}
}
   336  
// cancelSessionReadLocked clears the current session's intercepts and then
// cancels the session, waiting for it to end. The caller must hold at least a
// read-lock on s.sessionLock.
func (s *service) cancelSessionReadLocked() {
	if s.sessionCancel != nil {
		if err := s.session.ClearIntercepts(s.sessionContext); err != nil {
			dlog.Errorf(s.sessionContext, "failed to clear intercepts: %v", err)
		}
		s.sessionCancel()
	}
}
   345  
// cancelSession terminates the current session, if any, and resets the session
// state. Concurrent calls are collapsed: only the first caller performs the
// cancellation.
func (s *service) cancelSession() {
	// sessionQuitting serves as an atomic "cancellation in progress" flag.
	if !atomic.CompareAndSwapInt32(&s.sessionQuitting, 0, 1) {
		return
	}
	s.sessionLock.RLock()
	s.cancelSessionReadLocked()
	s.sessionLock.RUnlock()

	// We have to cancel the session before we can acquire this write-lock, because we need any long-running RPCs
	// that may be holding the RLock to die.
	s.sessionLock.Lock()
	s.session = nil
	s.sessionCancel = nil
	atomic.StoreInt32(&s.sessionQuitting, 0)
	s.sessionLock.Unlock()
}
   362  
// run is the main function when executing as the connector. It loads the
// client configuration, opens the gRPC listener (TCP or socket/pipe), creates
// the service, and runs all daemon goroutines (gRPC server, config watcher,
// session manager, telemetry) until the group shuts down.
func run(cmd *cobra.Command, _ []string) error {
	c := cmd.Context()
	cfg, err := client.LoadConfig(c)
	if err != nil {
		return fmt.Errorf("failed to load config: %w", err)
	}
	c = client.WithConfig(c, cfg)

	// Listen on domain unix domain socket or windows named pipe. The listener must be opened
	// before other tasks because the CLI client will only wait for a short period of time for
	// the connection/socket/pipe to appear before it gives up.
	var grpcListener net.Listener
	flags := cmd.Flags()
	if pprofPort, _ := flags.GetUint16(pprofFlag); pprofPort > 0 {
		// Optional pprof server for debugging; its failure only gets logged.
		go func() {
			if err := pprof.PprofServer(c, pprofPort); err != nil {
				dlog.Error(c, err)
			}
		}()
	}

	// A daemon name of the form "<name>-<session>" splits into the goroutine
	// name prefix and the name of the session-manager goroutine.
	name, _ := flags.GetString(nameFlag)
	sessionName := "session"
	if di := strings.IndexByte(name, '-'); di > 0 {
		sessionName = name[di+1:]
		name = name[:di]
	}
	c = dgroup.WithGoroutineName(c, "/"+name)
	c, err = logging.InitContext(c, userd.ProcessName, logging.RotateDaily, true)
	if err != nil {
		return err
	}
	rootSessionInProc, _ := flags.GetBool(embedNetworkFlag)
	var daemonAddress *net.TCPAddr
	if addr, _ := flags.GetString(addressFlag); addr != "" {
		// Explicit address given: listen on TCP.
		lc := net.ListenConfig{}
		if grpcListener, err = lc.Listen(c, "tcp", addr); err != nil {
			return err
		}
		daemonAddress = grpcListener.Addr().(*net.TCPAddr)
		defer func() {
			_ = grpcListener.Close()
		}()
	} else {
		// Default: listen on the platform's user-daemon socket/named pipe.
		socketPath := socket.UserDaemonPath(c)
		dlog.Infof(c, "Starting socket listener for %s", socketPath)
		if grpcListener, err = socket.Listen(c, userd.ProcessName, socketPath); err != nil {
			dlog.Errorf(c, "socket listener for %s failed: %v", socketPath, err)
			return err
		}
		defer func() {
			_ = socket.Remove(grpcListener)
		}()
	}
	dlog.Debugf(c, "Listener opened on %s", grpcListener.Addr())

	dlog.Info(c, "---")
	dlog.Infof(c, "Telepresence %s %s starting...", titleName, client.DisplayVersion())
	dlog.Infof(c, "PID is %d", os.Getpid())
	dlog.Info(c, "")

	// Don't bother calling 'conn.Close()', it should remain open until we shut down, and just
	// prefer to let the OS close it when we exit.

	c = scout.NewReporter(c, "connector")
	g := dgroup.NewGroup(c, dgroup.GroupConfig{
		SoftShutdownTimeout:  2 * time.Second,
		EnableSignalHandling: true,
		ShutdownOnNonError:   true,
	})

	// Start services from within a group routine so that it gets proper cancellation
	// when the group is cancelled.
	siCh := make(chan userd.Service)
	g.Go("service", func(c context.Context) error {
		opts := []grpc.ServerOption{
			grpc.StatsHandler(otelgrpc.NewServerHandler()),
		}
		if mz := cfg.Grpc().MaxReceiveSize(); mz > 0 {
			opts = append(opts, grpc.MaxRecvMsgSize(int(mz)))
		}
		si, err := userd.GetNewServiceFunc(c)(c, g, cfg, grpc.NewServer(opts...))
		if err != nil {
			// Closing siCh without a send makes the receive below yield ok == false.
			close(siCh)
			return err
		}
		siCh <- si
		close(siCh)

		<-c.Done() // wait for context cancellation
		return nil
	})

	si, ok := <-siCh
	if !ok {
		// Return error from the "service" go routine
		return g.Wait()
	}

	var s *service
	si.As(&s)
	s.rootSessionInProc = rootSessionInProc
	s.daemonAddress = daemonAddress

	if err := logging.LoadTimedLevelFromCache(c, s.timedLogLevel, userd.ProcessName); err != nil {
		return err
	}

	if cfg.Intercept().UseFtp {
		// Initialize the FuseFTP manager lazily; a failure is logged, not fatal.
		g.Go("fuseftp-server", func(c context.Context) error {
			if err := s.fuseFtpMgr.DeferInit(c); err != nil {
				dlog.Error(c, err)
			}
			<-c.Done()
			return nil
		})
	}

	g.Go("server-grpc", func(c context.Context) (err error) {
		sc := &dhttp.ServerConfig{Handler: s.srv}
		dlog.Info(c, "gRPC server started")
		if err = sc.Serve(c, grpcListener); err != nil && c.Err() != nil {
			err = nil // Normal shutdown
		}
		if err != nil {
			dlog.Errorf(c, "gRPC server ended with: %v", err)
		} else {
			dlog.Debug(c, "gRPC server ended")
		}
		return err
	})

	g.Go("config-reload", s.configReload)
	g.Go(sessionName, func(c context.Context) error {
		// s.quit cancels this goroutine's context (unless temporarily disabled),
		// which ends ManageSessions and thereby the daemon's session handling.
		c, cancel := context.WithCancel(c)
		s.quit = func() {
			if !s.quitDisable {
				cancel()
			}
		}
		return s.ManageSessions(c)
	})

	// background-metriton is the goroutine that handles all telemetry reports, so that calls to
	// metriton don't block the functional goroutines.
	g.Go("background-metriton", scout.Run)

	err = g.Wait()
	if err != nil {
		dlog.Error(c, err)
	}
	return err
}