github.com/telepresenceio/telepresence/v2@v2.20.0-pro.6.0.20240517030216-236ea954e789/pkg/client/userd/daemon/service.go

package daemon

import (
	"context"
	"errors"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/spf13/cobra"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/grpc"

	"github.com/datawire/dlib/dgroup"
	"github.com/datawire/dlib/dhttp"
	"github.com/datawire/dlib/dlog"
	"github.com/telepresenceio/telepresence/rpc/v2/common"
	rpc "github.com/telepresenceio/telepresence/rpc/v2/connector"
	"github.com/telepresenceio/telepresence/rpc/v2/manager"
	"github.com/telepresenceio/telepresence/v2/pkg/client"
	"github.com/telepresenceio/telepresence/v2/pkg/client/cli/daemon"
	"github.com/telepresenceio/telepresence/v2/pkg/client/logging"
	"github.com/telepresenceio/telepresence/v2/pkg/client/remotefs"
	"github.com/telepresenceio/telepresence/v2/pkg/client/scout"
	"github.com/telepresenceio/telepresence/v2/pkg/client/socket"
	"github.com/telepresenceio/telepresence/v2/pkg/client/userd"
	"github.com/telepresenceio/telepresence/v2/pkg/client/userd/trafficmgr"
	"github.com/telepresenceio/telepresence/v2/pkg/errcat"
	"github.com/telepresenceio/telepresence/v2/pkg/filelocation"
	"github.com/telepresenceio/telepresence/v2/pkg/log"
	"github.com/telepresenceio/telepresence/v2/pkg/pprof"
	"github.com/telepresenceio/telepresence/v2/pkg/proc"
	"github.com/telepresenceio/telepresence/v2/pkg/tracing"
)

const titleName = "Connector"

func help() string {
	return `The Telepresence ` + titleName + ` is a background component that manages a connection.

Launch the Telepresence ` + titleName + `:
    telepresence connect

Examine the ` + titleName + `'s log output in
    ` + filepath.Join(filelocation.AppUserLogDir(context.Background()), userd.ProcessName+".log") + `
to troubleshoot problems.
`
}

// service represents the long-running state of the Telepresence User Daemon.
type service struct {
	rpc.UnsafeConnectorServer
	srv           *grpc.Server
	managerProxy  *mgrProxy
	timedLogLevel log.TimedLevel
	ucn           int64
	fuseFTPError  error

	// The quit function that quits the server.
	quit func()

	// quitDisable will temporarily disable the quit function. This is used when there's a desire
	// to cancel the session without cancelling the process, even though the simplified session
	// management is in effect (rootSessionInProc == true).
	quitDisable bool

	session         userd.Session
	sessionCancel   context.CancelFunc
	sessionContext  context.Context
	sessionQuitting int32 // atomic boolean. True if non-zero.
	sessionLock     sync.RWMutex

	// These are used to communicate between the various goroutines.
	connectRequest  chan *rpc.ConnectRequest // server-grpc.connect() -> connectWorker
	connectResponse chan *rpc.ConnectInfo    // connectWorker -> server-grpc.connect()

	fuseFtpMgr remotefs.FuseFTPManager

	// Run the root session in-process.
	rootSessionInProc bool

	// The TCP address that the daemon listens to. Will be nil if the daemon listens to a unix socket.
	daemonAddress *net.TCPAddr

	// Possibly extended version of the service. Use when calling interface methods.
	self userd.Service
}
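
// The self field enables a subclass-like extension pattern: an embedding type
// can override interface methods, register itself with SetSelf, and have this
// base implementation dispatch through the override whenever it calls via
// s.self. A minimal sketch of how such a wrapper might look (extendedService
// and newExtendedService are hypothetical names, not part of this package):
//
//	type extendedService struct {
//		*service
//	}
//
//	func newExtendedService(s *service) userd.Service {
//		x := &extendedService{service: s}
//		s.SetSelf(x) // calls made via s.self now reach the wrapper's overrides
//		return x
//	}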

func NewService(ctx context.Context, _ *dgroup.Group, cfg client.Config, srv *grpc.Server) (userd.Service, error) {
	s := &service{
		srv:             srv,
		connectRequest:  make(chan *rpc.ConnectRequest),
		connectResponse: make(chan *rpc.ConnectInfo),
		managerProxy:    &mgrProxy{},
		timedLogLevel:   log.NewTimedLevel(cfg.LogLevels().UserDaemon.String(), log.SetLevel),
		fuseFtpMgr:      remotefs.NewFuseFTPManager(),
	}
	s.self = s
	if srv != nil {
		// The podd daemon never registers the gRPC servers.
		rpc.RegisterConnectorServer(srv, s)
		rpc.RegisterManagerProxyServer(srv, s.managerProxy)
		tracer, err := tracing.NewTraceServer(ctx, "user-daemon")
		if err != nil {
			return nil, err
		}
		common.RegisterTracingServer(srv, tracer)
	} else {
		s.rootSessionInProc = true
		s.quit = func() {}
	}
	return s, nil
}

func (s *service) As(ptr any) {
	switch ptr := ptr.(type) {
	case **service:
		*ptr = s
	case *rpc.ConnectorServer:
		*ptr = s
	default:
		panic(fmt.Sprintf("%T does not implement %T", s, ptr))
	}
}

func (s *service) ListenerAddress(ctx context.Context) string {
	if s.daemonAddress != nil {
		return s.daemonAddress.String()
	}
	return "unix:" + socket.UserDaemonPath(ctx)
}

func (s *service) SetSelf(self userd.Service) {
	s.self = self
}

func (s *service) FuseFTPMgr() remotefs.FuseFTPManager {
	return s.fuseFtpMgr
}

func (s *service) RootSessionInProcess() bool {
	return s.rootSessionInProc
}

func (s *service) Server() *grpc.Server {
	return s.srv
}

func (s *service) SetManagerClient(managerClient manager.ManagerClient, callOptions ...grpc.CallOption) {
	s.managerProxy.setClient(managerClient, callOptions...)
}

const (
	nameFlag         = "name"
	addressFlag      = "address"
	embedNetworkFlag = "embed-network"
	pprofFlag        = "pprof"
)

// Command returns the CLI sub-command for "connector-foreground".
func Command() *cobra.Command {
	c := &cobra.Command{
		Use:    userd.ProcessName + "-foreground",
		Short:  "Launch Telepresence " + titleName + " in the foreground (debug)",
		Args:   cobra.ExactArgs(0),
		Hidden: true,
		Long:   help(),
		RunE:   run,
	}
	flags := c.Flags()
	flags.String(nameFlag, userd.ProcessName, "Daemon name")
	flags.String(addressFlag, "", "Address to listen to. Defaults to "+socket.UserDaemonPath(context.Background()))
	flags.Bool(embedNetworkFlag, false, "Embed network functionality in the user daemon. Requires capability NET_ADMIN")
	flags.Uint16(pprofFlag, 0, "Start a pprof server on the given port")
	return c
}
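
// Example invocation (illustrative): the sub-command is hidden and is normally
// spawned by the CLI itself, but it can be run directly for debugging. Assuming
// userd.ProcessName is "connector", the following would start the daemon in the
// foreground with a TCP listener on an OS-assigned port and a pprof server:
//
//	telepresence connector-foreground --address 127.0.0.1:0 --pprof 6060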

func (s *service) configReload(c context.Context) error {
	// Ensure that the directory to watch exists.
	if err := os.MkdirAll(filepath.Dir(client.GetConfigFile(c)), 0o755); err != nil {
		return err
	}
	return client.Watch(c, func(ctx context.Context) error {
		s.sessionLock.RLock()
		defer s.sessionLock.RUnlock()
		if s.session == nil {
			return client.RestoreDefaults(c, false)
		}
		return s.session.ApplyConfig(c)
	})
}

// ManageSessions is the counterpart to the Connect method. It reads the connectRequest
// channel, creates a session, and writes a reply to the connectResponse channel. The
// session is then started if it was successfully created.
func (s *service) ManageSessions(c context.Context) error {
	wg := sync.WaitGroup{}
	defer wg.Wait()

	for {
		// Wait for a connection request.
		select {
		case <-c.Done():
			return nil
		case cr := <-s.connectRequest:
			rsp := s.startSession(c, cr, &wg)
			select {
			case s.connectResponse <- rsp:
			default:
				// Nobody left to read the response? That's fine, really. It just means
				// that whoever wanted to start the session terminated early.
				s.cancelSession()
			}
		}
	}
}

func (s *service) startSession(ctx context.Context, cr *rpc.ConnectRequest, wg *sync.WaitGroup) *rpc.ConnectInfo {
	s.sessionLock.Lock() // Locked during creation
	defer s.sessionLock.Unlock()

	if s.session != nil {
		// UpdateStatus sets rpc.ConnectInfo_ALREADY_CONNECTED if successful.
		return s.session.UpdateStatus(s.sessionContext, cr)
	}

	// Obtain the kubeconfig from the request parameters so that we can determine
	// which kubernetes context will be used.
	config, err := client.DaemonKubeconfig(ctx, cr)
	if err != nil {
		if s.rootSessionInProc {
			s.quit()
		}
		dlog.Errorf(ctx, "Failed to obtain kubeconfig: %v", err)
		return &rpc.ConnectInfo{
			Error:         rpc.ConnectInfo_CLUSTER_FAILED,
			ErrorText:     err.Error(),
			ErrorCategory: int32(errcat.GetCategory(err)),
		}
	}

	ctx, cancel := context.WithCancel(ctx)
	ctx = userd.WithService(ctx, s.self)

	daemonID, err := daemon.NewIdentifier(cr.Name, config.Context, config.Namespace, proc.RunningInContainer())
	if err != nil {
		cancel()
		return &rpc.ConnectInfo{
			Error:         rpc.ConnectInfo_CLUSTER_FAILED,
			ErrorText:     err.Error(),
			ErrorCategory: int32(errcat.GetCategory(err)),
		}
	}
	go runAliveAndCancellation(ctx, cancel, daemonID)

	ctx, session, rsp := userd.GetNewSessionFunc(ctx)(ctx, cr, config)
	if ctx.Err() != nil || rsp.Error != rpc.ConnectInfo_UNSPECIFIED {
		cancel()
		if s.rootSessionInProc {
			// Simplified session management. The daemon handles one session, then exits.
			s.quit()
		}
		return rsp
	}
	s.session = session
	s.sessionContext = userd.WithSession(ctx, session)
	s.sessionCancel = func() {
		cancel()
		<-session.Done()
	}

	// Run the session asynchronously. We must be able to respond to connect (with UpdateStatus)
	// while the session is running. The s.sessionCancel is called from Disconnect.
	wg.Add(1)
	go func(cr *rpc.ConnectRequest) {
		defer func() {
			s.sessionLock.Lock()
			s.self.SetManagerClient(nil)
			s.session = nil
			s.sessionCancel = nil
			if err := client.RestoreDefaults(ctx, false); err != nil {
				dlog.Warn(ctx, err)
			}
			s.sessionLock.Unlock()
			wg.Done()
		}()
		if err := session.RunSession(s.sessionContext); err != nil {
			if errors.Is(err, trafficmgr.ErrSessionExpired) {
				// The session has expired. We need to cancel the owner session and reconnect.
				dlog.Info(ctx, "refreshing session")
				s.cancelSession()
				select {
				case <-ctx.Done():
				case s.connectRequest <- cr:
				}
				return
			}

			dlog.Error(ctx, err)
		}
		if s.rootSessionInProc {
			// Simplified session management. The daemon handles one session, then exits.
			s.quit()
		}
	}(cr)
	return rsp
}
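
// The caller's side of the rendezvous that ManageSessions serves looks roughly
// like this (a sketch only; the real Connect handler lives in the gRPC layer,
// and exampleConnect is a hypothetical name):
//
//	func exampleConnect(ctx context.Context, s *service, cr *rpc.ConnectRequest) (*rpc.ConnectInfo, error) {
//		select {
//		case <-ctx.Done():
//			return nil, ctx.Err()
//		case s.connectRequest <- cr:
//		}
//		select {
//		case <-ctx.Done():
//			return nil, ctx.Err()
//		case rsp := <-s.connectResponse:
//			return rsp, nil
//		}
//	}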

func runAliveAndCancellation(ctx context.Context, cancel context.CancelFunc, daemonID *daemon.Identifier) {
	daemonInfoFile := daemonID.InfoFileName()
	g := dgroup.NewGroup(ctx, dgroup.GroupConfig{})
	g.Go(fmt.Sprintf("info-kicker-%s", daemonID), func(ctx context.Context) error {
		// Ensure that the daemon info file is kept recent. This tells clients that we're alive.
		return daemon.KeepInfoAlive(ctx, daemonInfoFile)
	})
	g.Go(fmt.Sprintf("info-watcher-%s", daemonID), func(ctx context.Context) error {
		// Cancel the session if the daemon info file is removed.
		return daemon.WatchInfos(ctx, func(ctx context.Context) error {
			ok, err := daemon.InfoExists(ctx, daemonInfoFile)
			if err == nil && !ok {
				dlog.Debugf(ctx, "info-watcher cancels everything because daemon info %s does not exist", daemonInfoFile)
				cancel()
			}
			return err
		}, daemonInfoFile)
	})
	if err := g.Wait(); err != nil {
		dlog.Error(ctx, err)
	}
}

func (s *service) cancelSessionReadLocked() {
	if s.sessionCancel != nil {
		if err := s.session.ClearIntercepts(s.sessionContext); err != nil {
			dlog.Errorf(s.sessionContext, "failed to clear intercepts: %v", err)
		}
		s.sessionCancel()
	}
}

func (s *service) cancelSession() {
	if !atomic.CompareAndSwapInt32(&s.sessionQuitting, 0, 1) {
		return
	}
	s.sessionLock.RLock()
	s.cancelSessionReadLocked()
	s.sessionLock.RUnlock()

	// We have to cancel the session before we can acquire this write-lock, because we need any
	// long-running RPCs that may be holding the RLock to die.
	s.sessionLock.Lock()
	s.session = nil
	s.sessionCancel = nil
	atomic.StoreInt32(&s.sessionQuitting, 0)
	s.sessionLock.Unlock()
}
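
// The two-phase locking in cancelSession exists because long-running RPCs read
// s.session while holding sessionLock.RLock: cancelling under the read lock
// lets those RPCs observe the cancellation and return, after which the write
// lock can be acquired to clear the fields. A sketch of the reader side that
// this protects (withSession is a hypothetical helper, not part of this file):
//
//	func (s *service) withSession(f func(context.Context, userd.Session)) {
//		s.sessionLock.RLock()
//		defer s.sessionLock.RUnlock()
//		if s.session != nil {
//			f(s.sessionContext, s.session)
//		}
//	}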

// run is the main function when executing as the connector.
func run(cmd *cobra.Command, _ []string) error {
	c := cmd.Context()
	cfg, err := client.LoadConfig(c)
	if err != nil {
		return fmt.Errorf("failed to load config: %w", err)
	}
	c = client.WithConfig(c, cfg)

	// Listen on a unix domain socket or a windows named pipe. The listener must be opened
	// before other tasks because the CLI client will only wait a short period of time for
	// the connection/socket/pipe to appear before it gives up.
	var grpcListener net.Listener
	flags := cmd.Flags()
	if pprofPort, _ := flags.GetUint16(pprofFlag); pprofPort > 0 {
		go func() {
			if err := pprof.PprofServer(c, pprofPort); err != nil {
				dlog.Error(c, err)
			}
		}()
	}

	name, _ := flags.GetString(nameFlag)
	sessionName := "session"
	if di := strings.IndexByte(name, '-'); di > 0 {
		sessionName = name[di+1:]
		name = name[:di]
	}
	c = dgroup.WithGoroutineName(c, "/"+name)
	c, err = logging.InitContext(c, userd.ProcessName, logging.RotateDaily, true)
	if err != nil {
		return err
	}
	rootSessionInProc, _ := flags.GetBool(embedNetworkFlag)
	var daemonAddress *net.TCPAddr
	if addr, _ := flags.GetString(addressFlag); addr != "" {
		lc := net.ListenConfig{}
		if grpcListener, err = lc.Listen(c, "tcp", addr); err != nil {
			return err
		}
		daemonAddress = grpcListener.Addr().(*net.TCPAddr)
		defer func() {
			_ = grpcListener.Close()
		}()
	} else {
		socketPath := socket.UserDaemonPath(c)
		dlog.Infof(c, "Starting socket listener for %s", socketPath)
		if grpcListener, err = socket.Listen(c, userd.ProcessName, socketPath); err != nil {
			dlog.Errorf(c, "socket listener for %s failed: %v", socketPath, err)
			return err
		}
		defer func() {
			_ = socket.Remove(grpcListener)
		}()
	}
	dlog.Debugf(c, "Listener opened on %s", grpcListener.Addr())

	dlog.Info(c, "---")
	dlog.Infof(c, "Telepresence %s %s starting...", titleName, client.DisplayVersion())
	dlog.Infof(c, "PID is %d", os.Getpid())
	dlog.Info(c, "")

	// Don't bother calling 'conn.Close()'. It should remain open until we shut down, and
	// it's preferable to let the OS close it when we exit.

	c = scout.NewReporter(c, "connector")
	g := dgroup.NewGroup(c, dgroup.GroupConfig{
		SoftShutdownTimeout:  2 * time.Second,
		EnableSignalHandling: true,
		ShutdownOnNonError:   true,
	})

	// Start the services from within a group routine so that they get proper cancellation
	// when the group is cancelled.
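	//
	// The channel handoff below has two outcomes: the "service" goroutine either
	// delivers the constructed userd.Service on siCh, or closes the channel
	// without sending. The receive that follows distinguishes the two cases via
	// the ok flag and, on failure, surfaces the goroutine's error through g.Wait().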
	siCh := make(chan userd.Service)
	g.Go("service", func(c context.Context) error {
		opts := []grpc.ServerOption{
			grpc.StatsHandler(otelgrpc.NewServerHandler()),
		}
		if mz := cfg.Grpc().MaxReceiveSize(); mz > 0 {
			opts = append(opts, grpc.MaxRecvMsgSize(int(mz)))
		}
		si, err := userd.GetNewServiceFunc(c)(c, g, cfg, grpc.NewServer(opts...))
		if err != nil {
			close(siCh)
			return err
		}
		siCh <- si
		close(siCh)

		<-c.Done() // wait for context cancellation
		return nil
	})

	si, ok := <-siCh
	if !ok {
		// Return the error from the "service" goroutine.
		return g.Wait()
	}

	var s *service
	si.As(&s)
	s.rootSessionInProc = rootSessionInProc
	s.daemonAddress = daemonAddress

	if err := logging.LoadTimedLevelFromCache(c, s.timedLogLevel, userd.ProcessName); err != nil {
		return err
	}

	if cfg.Intercept().UseFtp {
		g.Go("fuseftp-server", func(c context.Context) error {
			if err := s.fuseFtpMgr.DeferInit(c); err != nil {
				dlog.Error(c, err)
			}
			<-c.Done()
			return nil
		})
	}

	g.Go("server-grpc", func(c context.Context) (err error) {
		sc := &dhttp.ServerConfig{Handler: s.srv}
		dlog.Info(c, "gRPC server started")
		if err = sc.Serve(c, grpcListener); err != nil && c.Err() != nil {
			err = nil // Normal shutdown
		}
		if err != nil {
			dlog.Errorf(c, "gRPC server ended with: %v", err)
		} else {
			dlog.Debug(c, "gRPC server ended")
		}
		return err
	})

	g.Go("config-reload", s.configReload)
	g.Go(sessionName, func(c context.Context) error {
		c, cancel := context.WithCancel(c)
		s.quit = func() {
			if !s.quitDisable {
				cancel()
			}
		}
		return s.ManageSessions(c)
	})

	// background-metriton is the goroutine that handles all telemetry reports, so that calls
	// to metriton don't block the functional goroutines.
	g.Go("background-metriton", scout.Run)

	err = g.Wait()
	if err != nil {
		dlog.Error(c, err)
	}
	return err
}