github.com/telepresenceio/telepresence/v2@v2.20.0-pro.6.0.20240517030216-236ea954e789/pkg/client/userd/trafficmgr/session.go (about)

     1  package trafficmgr
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"net"
     9  	"net/http"
    10  	"net/url"
    11  	"os"
    12  	"os/user"
    13  	"slices"
    14  	"sort"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/blang/semver/v4"
    20  	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
    21  	"google.golang.org/grpc"
    22  	"google.golang.org/grpc/codes"
    23  	"google.golang.org/grpc/status"
    24  	"google.golang.org/protobuf/types/known/durationpb"
    25  	empty "google.golang.org/protobuf/types/known/emptypb"
    26  	"gopkg.in/yaml.v3"
    27  	core "k8s.io/api/core/v1"
    28  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    29  	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	"k8s.io/client-go/util/homedir"
    32  
    33  	"github.com/datawire/dlib/dcontext"
    34  	"github.com/datawire/dlib/dgroup"
    35  	"github.com/datawire/dlib/dlog"
    36  	"github.com/datawire/dlib/dtime"
    37  	"github.com/datawire/k8sapi/pkg/k8sapi"
    38  	"github.com/telepresenceio/telepresence/rpc/v2/authenticator"
    39  	"github.com/telepresenceio/telepresence/rpc/v2/common"
    40  	"github.com/telepresenceio/telepresence/rpc/v2/connector"
    41  	rpc "github.com/telepresenceio/telepresence/rpc/v2/connector"
    42  	rootdRpc "github.com/telepresenceio/telepresence/rpc/v2/daemon"
    43  	"github.com/telepresenceio/telepresence/rpc/v2/manager"
    44  	"github.com/telepresenceio/telepresence/v2/pkg/agentconfig"
    45  	authGrpc "github.com/telepresenceio/telepresence/v2/pkg/authenticator/grpc"
    46  	"github.com/telepresenceio/telepresence/v2/pkg/authenticator/patcher"
    47  	"github.com/telepresenceio/telepresence/v2/pkg/client"
    48  	"github.com/telepresenceio/telepresence/v2/pkg/client/cli/daemon"
    49  	"github.com/telepresenceio/telepresence/v2/pkg/client/k8sclient"
    50  	"github.com/telepresenceio/telepresence/v2/pkg/client/rootd"
    51  	"github.com/telepresenceio/telepresence/v2/pkg/client/scout"
    52  	"github.com/telepresenceio/telepresence/v2/pkg/client/socket"
    53  	"github.com/telepresenceio/telepresence/v2/pkg/client/userd"
    54  	"github.com/telepresenceio/telepresence/v2/pkg/client/userd/k8s"
    55  	"github.com/telepresenceio/telepresence/v2/pkg/dnet"
    56  	"github.com/telepresenceio/telepresence/v2/pkg/errcat"
    57  	"github.com/telepresenceio/telepresence/v2/pkg/iputil"
    58  	"github.com/telepresenceio/telepresence/v2/pkg/matcher"
    59  	"github.com/telepresenceio/telepresence/v2/pkg/proc"
    60  	"github.com/telepresenceio/telepresence/v2/pkg/restapi"
    61  )
    62  
    63  type apiServer struct {
    64  	restapi.Server
    65  	cancel context.CancelFunc
    66  }
    67  
    68  type apiMatcher struct {
    69  	requestMatcher matcher.Request
    70  	metadata       map[string]string
    71  }
    72  
    73  type session struct {
    74  	*k8s.Cluster
    75  	rootDaemon         rootdRpc.DaemonClient
    76  	subnetViaWorkloads []*rootdRpc.SubnetViaWorkload
    77  
    78  	// local information
    79  	installID   string // telepresence's install ID
    80  	userAndHost string // "laptop-username@laptop-hostname"
    81  
    82  	// Kubernetes Port Forward Dialer
    83  	pfDialer dnet.PortForwardDialer
    84  
    85  	// manager client
    86  	managerClient manager.ManagerClient
    87  
    88  	// manager client connection
    89  	managerConn *grpc.ClientConn
    90  
    91  	// name reported by the manager
    92  	managerName string
    93  
    94  	// version reported by the manager
    95  	managerVersion semver.Version
    96  
    97  	// The identifier for this daemon
    98  	daemonID *daemon.Identifier
    99  
   100  	sessionInfo *manager.SessionInfo // sessionInfo returned by the traffic-manager
   101  
   102  	wlWatcher *workloadsAndServicesWatcher
   103  
   104  	// currentInterceptsLock ensures that all accesses to currentIntercepts, currentMatchers,
   105  	// currentAPIServers, interceptWaiters, and ingressInfo are synchronized
   106  	//
   107  	currentInterceptsLock sync.Mutex
   108  
   109  	// currentIntercepts is the latest snapshot returned by the intercept watcher. It
   110  	// is keyed by the intercept ID.
   111  	currentIntercepts map[string]*intercept
   112  
   113  	// currentMatchers holds the matchers used when using the APIServer.
   114  	currentMatchers map[string]*apiMatcher
   115  
   116  	// currentAPIServers contains the APIServer in use. Typically there is zero or one, but since the
   117  	// port is determined by the intercept, there might theoretically be several.
   118  	currentAPIServers map[int]*apiServer
   119  
   120  	// interceptWaiters is a map of desired, awaited intercepts. It is keyed by intercept name, because it
   121  	// is filled in prior to the intercept being created. Entries are short-lived; they
   122  	// are deleted as soon as the intercept arrives and gets stored in currentIntercepts.
   123  	interceptWaiters map[string]*awaitIntercept
   124  
   125  	ingressInfo []*manager.IngressInfo
   126  
   127  	isPodDaemon bool
   128  
   129  	sessionConfig client.Config
   130  
   131  	// done is closed when the session ends
   132  	done chan struct{}
   133  
   134  	// self is a possibly extended version of the session. Use it when calling interface methods.
   135  	self userd.Session
   136  }
   137  
   138  func NewSession(
   139  	ctx context.Context,
   140  	cr *rpc.ConnectRequest,
   141  	config *client.Kubeconfig,
   142  ) (_ context.Context, _ userd.Session, info *connector.ConnectInfo) {
   143  	dlog.Info(ctx, "-- Starting new session")
   144  
   145  	connectStart := time.Now()
   146  	defer func() {
   147  		if info.Error == connector.ConnectInfo_UNSPECIFIED {
   148  			scout.Report(ctx, "connect",
   149  				scout.Entry{
   150  					Key:   "time_to_connect",
   151  					Value: time.Since(connectStart).Seconds(),
   152  				}, scout.Entry{
   153  					Key:   "mapped_namespaces",
   154  					Value: len(cr.MappedNamespaces),
   155  				})
   156  		} else {
   157  			scout.Report(ctx, "connect_error",
   158  				scout.Entry{
   159  					Key:   "error",
   160  					Value: info.ErrorText,
   161  				}, scout.Entry{
   162  					Key:   "error_type",
   163  					Value: info.Error.String(),
   164  				}, scout.Entry{
   165  					Key:   "error_category",
   166  					Value: info.ErrorCategory,
   167  				}, scout.Entry{
   168  					Key:   "time_to_fail",
   169  					Value: time.Since(connectStart).Seconds(),
   170  				}, scout.Entry{
   171  					Key:   "mapped_namespaces",
   172  					Value: len(cr.MappedNamespaces),
   173  				})
   174  		}
   175  	}()
   176  
   177  	dlog.Info(ctx, "Connecting to k8s cluster...")
   178  	cluster, err := k8s.ConnectCluster(ctx, cr, config)
   179  	if err != nil {
   180  		dlog.Errorf(ctx, "unable to track k8s cluster: %+v", err)
   181  		return ctx, nil, connectError(rpc.ConnectInfo_CLUSTER_FAILED, err)
   182  	}
   183  	dlog.Infof(ctx, "Connected to context %s, namespace %s (%s)", cluster.Context, cluster.Namespace, cluster.Server)
   184  
   185  	ctx = cluster.WithK8sInterface(ctx)
   186  	scout.SetMetadatum(ctx, "cluster_id", cluster.GetClusterId(ctx))
   187  
   188  	dlog.Info(ctx, "Connecting to traffic manager...")
   189  	tmgr, err := connectMgr(ctx, cluster, scout.InstallID(ctx), cr)
   190  	if err != nil {
   191  		dlog.Errorf(ctx, "Unable to connect to session: %s", err)
   192  		return ctx, nil, connectError(rpc.ConnectInfo_TRAFFIC_MANAGER_FAILED, err)
   193  	}
   194  
   195  	// store session in ctx for reporting
   196  	ctx = scout.WithSession(ctx, tmgr)
   197  
   198  	tmgr.sessionConfig = client.GetDefaultConfig()
   199  	cliCfg, err := tmgr.managerClient.GetClientConfig(ctx, &empty.Empty{})
   200  	if err != nil {
   201  		if status.Code(err) != codes.Unimplemented {
   202  			dlog.Warnf(ctx, "Failed to get remote config from traffic manager: %v", err)
   203  		}
   204  	} else {
   205  		if err := yaml.Unmarshal(cliCfg.ConfigYaml, tmgr.sessionConfig); err != nil {
   206  			dlog.Warnf(ctx, "Failed to deserialize remote config: %v", err)
   207  		}
   208  		if err := tmgr.ApplyConfig(ctx); err != nil {
   209  			dlog.Warnf(ctx, "failed to apply config from traffic-manager: %v", err)
   210  		}
   211  		if err := cluster.AddRemoteKubeConfigExtension(ctx, cliCfg.ConfigYaml); err != nil {
   212  			dlog.Warnf(ctx, "Failed to set remote kubeconfig values: %v", err)
   213  		}
   214  	}
   215  	ctx = dnet.WithPortForwardDialer(ctx, tmgr.pfDialer)
   216  
   217  	oi := tmgr.getOutboundInfo(ctx, cr)
   218  	if !userd.GetService(ctx).RootSessionInProcess() {
   219  		// Connect to the root daemon if it is running. It's the CLI that starts it initially
   220  		rootRunning, err := socket.IsRunning(ctx, socket.RootDaemonPath(ctx))
   221  		if err != nil {
   222  			return ctx, nil, connectError(rpc.ConnectInfo_DAEMON_FAILED, err)
   223  		}
   224  		if !rootRunning {
   225  			return ctx, nil, connectError(rpc.ConnectInfo_DAEMON_FAILED, errors.New("root daemon is not running"))
   226  		}
   227  
   228  		if client.GetConfig(ctx).Cluster().ConnectFromRootDaemon {
   229  			// Root daemon needs this to authenticate with the cluster. Potential exec configurations in the kubeconfig
   230  			// must be executed by the user, not by root.
   231  			konfig, err := patcher.CreateExternalKubeConfig(ctx, config.ClientConfig, cluster.Context, func([]string) (string, string, error) {
   232  				s := userd.GetService(ctx)
   233  				if _, ok := s.Server().GetServiceInfo()[authenticator.Authenticator_ServiceDesc.ServiceName]; !ok {
   234  					authGrpc.RegisterAuthenticatorServer(s.Server(), config.ClientConfig)
   235  				}
   236  				return client.GetExe(ctx), s.ListenerAddress(ctx), nil
   237  			}, nil)
   238  			if err != nil {
   239  				return ctx, nil, connectError(rpc.ConnectInfo_DAEMON_FAILED, err)
   240  			}
   241  			patcher.AnnotateOutboundInfo(ctx, oi, konfig.CurrentContext)
   242  		}
   243  	}
   244  
   245  	tmgr.rootDaemon, err = tmgr.connectRootDaemon(ctx, oi, cr.IsPodDaemon)
   246  	if err != nil {
   247  		tmgr.managerConn.Close()
   248  		return ctx, nil, connectError(rpc.ConnectInfo_DAEMON_FAILED, err)
   249  	}
   253  
   254  	// Collect data on how long the connection took.
   255  	dlog.Debug(ctx, "Finished connecting to traffic manager")
   256  
   257  	tmgr.AddNamespaceListener(ctx, tmgr.updateDaemonNamespaces)
   258  	return ctx, tmgr, tmgr.status(ctx, true)
   259  }
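
// Editor's note: the sketch below is not part of the original source. It is a minimal,
// hedged illustration of how a caller might drive NewSession. It assumes that ctx already
// carries the userd.Service and scout reporter that NewSession and connectMgr rely on; the
// helper name exampleConnect and the flag values are hypothetical.
func exampleConnect(ctx context.Context, kubeconfig *client.Kubeconfig) (userd.Session, *connector.ConnectInfo) {
	cr := &rpc.ConnectRequest{
		KubeFlags:        map[string]string{"namespace": "default"},
		MappedNamespaces: []string{"default"},
	}
	ctx, sess, info := NewSession(ctx, cr, kubeconfig)
	if info.Error != connector.ConnectInfo_UNSPECIFIED {
		dlog.Errorf(ctx, "connect failed: %s", info.ErrorText)
		return nil, info
	}
	// The returned context carries the cluster's Kubernetes interface and port-forward dialer.
	return sess, info
}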
   260  
   261  // SetSelf is for internal use by extensions.
   262  func (s *session) SetSelf(self userd.Session) {
   263  	s.self = self
   264  }
   265  
   266  // RunSession (1) starts up by ensuring that the manager is installed and running,
   267  // but then for most of its life:
   268  //   - (2) calls manager.ArriveAsClient and then periodically calls manager.Remain,
   269  //   - runs the intercepts (manager.WatchIntercepts), and then
   270  //   - (3) listens on the appropriate local ports and forwards them to the intercepted
   271  //     Services, and
   272  //   - (4) mounts the appropriate remote volumes.
   273  func (s *session) RunSession(c context.Context) error {
   274  	self := s.self
   275  	g := dgroup.NewGroup(c, dgroup.GroupConfig{})
   276  	defer func() {
   277  		self.Epilog(c)
   278  	}()
   279  	self.StartServices(g)
   280  	return g.Wait()
   281  }
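
// Editor's note: the sketch below is not part of the original source. It shows, under the
// assumption that a *session has just been returned by NewSession, how a caller could run
// the session in the background and observe its termination. The name exampleRunInBackground
// and the channel plumbing are hypothetical; the real userd service adds more wiring.
func exampleRunInBackground(ctx context.Context, sess *session) <-chan error {
	done := make(chan error, 1)
	go func() {
		// RunSession blocks until the session's worker goroutines end, then runs Epilog.
		done <- sess.RunSession(ctx)
	}()
	return done
}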
   282  
   283  func (s *session) RootDaemon() rootdRpc.DaemonClient {
   284  	return s.rootDaemon
   285  }
   286  
   287  func (s *session) ManagerClient() manager.ManagerClient {
   288  	return s.managerClient
   289  }
   290  
   291  func (s *session) ManagerConn() *grpc.ClientConn {
   292  	return s.managerConn
   293  }
   294  
   295  func (s *session) ManagerName() string {
   296  	return s.managerName
   297  }
   298  
   299  func (s *session) ManagerVersion() semver.Version {
   300  	return s.managerVersion
   301  }
   302  
   303  func (s *session) getSessionConfig() client.Config {
   304  	return s.sessionConfig
   305  }
   306  
   307  // connectMgr returns a session for the given cluster that is connected to the traffic-manager.
   308  func connectMgr(
   309  	ctx context.Context,
   310  	cluster *k8s.Cluster,
   311  	installID string,
   312  	cr *rpc.ConnectRequest,
   313  ) (*session, error) {
   314  	tos := client.GetConfig(ctx).Timeouts()
   315  
   316  	ctx, cancel := tos.TimeoutContext(ctx, client.TimeoutTrafficManagerConnect)
   317  	defer cancel()
   318  
   319  	userinfo, err := user.Current()
   320  	if err != nil {
   321  		return nil, fmt.Errorf("unable to obtain current user: %w", err)
   322  	}
   323  	host, err := os.Hostname()
   324  	if err != nil {
   325  		return nil, fmt.Errorf("unable to obtain hostname: %w", err)
   326  	}
   327  
   328  	err = CheckTrafficManagerService(ctx, cluster.GetManagerNamespace())
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  
   333  	dlog.Debug(ctx, "creating port-forward")
   334  	pfDialer, err := dnet.NewK8sPortForwardDialer(ctx, cluster.Kubeconfig.RestConfig, k8sapi.GetK8sInterface(ctx))
   335  	if err != nil {
   336  		return nil, err
   337  	}
   338  	conn, mClient, vi, err := k8sclient.ConnectToManager(ctx, cluster.GetManagerNamespace(), pfDialer.Dial)
   339  	if err != nil {
   340  		return nil, err
   341  	}
   342  	managerVersion, err := semver.Parse(strings.TrimPrefix(vi.Version, "v"))
   343  	if err != nil {
   344  		return nil, fmt.Errorf("unable to parse manager.Version: %w", err)
   345  	}
   346  
   347  	userAndHost := fmt.Sprintf("%s@%s", userinfo.Username, host)
   348  
   349  	daemonID, err := daemon.NewIdentifier(cr.Name, cluster.Context, cluster.Namespace, proc.RunningInContainer())
   350  	if err != nil {
   351  		return nil, err
   352  	}
   353  	si, err := LoadSessionInfoFromUserCache(ctx, daemonID)
   354  	if err != nil {
   355  		return nil, err
   356  	}
   357  
   358  	svc := userd.GetService(ctx)
   359  	if si != nil {
   360  		// Check if the session is still valid in the traffic-manager by calling Remain
   361  		_, err = mClient.Remain(ctx, &manager.RemainRequest{Session: si})
   362  		if err == nil {
   363  			if ctx.Err() != nil {
   364  				// Call timed out, so the traffic-manager isn't responding at all
   365  				return nil, ctx.Err()
   366  			}
   367  			dlog.Debugf(ctx, "traffic-manager port-forward established, client was already known to the traffic-manager as %q", userAndHost)
   368  		} else {
   369  			si = nil
   370  		}
   371  	}
   372  
   373  	if si == nil {
   374  		dlog.Debugf(ctx, "traffic-manager port-forward established, making client known to the traffic-manager as %q", userAndHost)
   375  		si, err = mClient.ArriveAsClient(ctx, &manager.ClientInfo{
   376  			Name:      userAndHost,
   377  			Namespace: cluster.Namespace,
   378  			InstallId: installID,
   379  			Product:   "telepresence",
   380  			Version:   client.Version(),
   381  		})
   382  		if err != nil {
   383  			return nil, client.CheckTimeout(ctx, fmt.Errorf("manager.ArriveAsClient: %w", err))
   384  		}
   385  		if err = SaveSessionInfoToUserCache(ctx, daemonID, si); err != nil {
   386  			return nil, err
   387  		}
   388  	}
   389  
   390  	var opts []grpc.CallOption
   391  	cfg := client.GetConfig(ctx)
   392  	if mz := cfg.Grpc().MaxReceiveSize(); mz > 0 {
   393  		opts = append(opts, grpc.MaxCallRecvMsgSize(int(mz)))
   394  	}
   395  	svc.SetManagerClient(mClient, opts...)
   396  
   397  	managerName := vi.Name
   398  	if managerName == "" {
   399  		// Older traffic-managers don't distinguish between OSS and pro versions
   400  		managerName = "Traffic Manager"
   401  	}
   402  
   403  	extraAlsoProxy, err := parseCIDR(cr.GetAlsoProxy())
   404  	if err != nil {
   405  		return nil, fmt.Errorf("failed to parse extra also proxy: %w", err)
   406  	}
   407  
   408  	extraNeverProxy, err := parseCIDR(cr.GetNeverProxy())
   409  	if err != nil {
   410  		return nil, fmt.Errorf("failed to parse extra never proxy: %w", err)
   411  	}
   412  
   413  	extraAllow, err := parseCIDR(cr.GetAllowConflictingSubnets())
   414  	if err != nil {
   415  		return nil, fmt.Errorf("failed to parse extra allow conflicting subnets: %w", err)
   416  	}
   417  
   418  	cluster.AlsoProxy = append(cluster.AlsoProxy, extraAlsoProxy...)
   419  	cluster.NeverProxy = append(cluster.NeverProxy, extraNeverProxy...)
   420  	cluster.AllowConflictingSubnets = append(cluster.AllowConflictingSubnets, extraAllow...)
   421  
   422  	sess := &session{
   423  		Cluster:            cluster,
   424  		installID:          installID,
   425  		daemonID:           daemonID,
   426  		userAndHost:        userAndHost,
   427  		managerClient:      mClient,
   428  		managerConn:        conn,
   429  		pfDialer:           pfDialer,
   430  		managerName:        managerName,
   431  		managerVersion:     managerVersion,
   432  		sessionInfo:        si,
   433  		interceptWaiters:   make(map[string]*awaitIntercept),
   434  		wlWatcher:          newWASWatcher(),
   435  		isPodDaemon:        cr.IsPodDaemon,
   436  		done:               make(chan struct{}),
   437  		subnetViaWorkloads: cr.SubnetViaWorkloads,
   438  	}
   439  	sess.self = sess
   440  	return sess, nil
   441  }
   442  
   443  func (s *session) NewRemainRequest() *manager.RemainRequest {
   444  	return &manager.RemainRequest{Session: s.SessionInfo()}
   445  }
   446  
   447  func (s *session) Remain(ctx context.Context) error {
   448  	self := s.self
   449  	ctx, cancel := client.GetConfig(ctx).Timeouts().TimeoutContext(ctx, client.TimeoutTrafficManagerAPI)
   450  	defer cancel()
   451  	_, err := self.ManagerClient().Remain(ctx, self.NewRemainRequest())
   452  	if err != nil {
   453  		if status.Code(err) == codes.NotFound {
   454  			// Session has expired. We need to cancel the owner session and reconnect
   455  			return ErrSessionExpired
   456  		}
   457  		dlog.Errorf(ctx, "error calling Remain: %v", client.CheckTimeout(ctx, err))
   458  	}
   459  	return nil
   460  }
   461  
   462  func parseCIDR(cidr []string) ([]*iputil.Subnet, error) {
   463  	result := make([]*iputil.Subnet, 0)
   464  
   465  	if cidr == nil {
   466  		return result, nil
   467  	}
   468  
   469  	for i := range cidr {
   470  		_, ipNet, err := net.ParseCIDR(cidr[i])
   471  		if err != nil {
   472  			return nil, fmt.Errorf("failed to parse CIDR %s: %w", cidr[i], err)
   473  		}
   474  		result = append(result, (*iputil.Subnet)(ipNet))
   475  	}
   476  
   477  	return result, nil
   478  }
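
// Editor's note: the sketch below is not part of the original source. It illustrates the
// parseCIDR helper with hypothetical CIDR strings, mirroring how connectMgr parses the
// also-proxy and never-proxy lists from the connect request above.
func exampleParseCIDR(ctx context.Context) {
	subnets, err := parseCIDR([]string{"10.0.0.0/8", "192.168.1.0/24"})
	if err != nil {
		dlog.Errorf(ctx, "invalid CIDR: %v", err)
		return
	}
	for _, sn := range subnets {
		// An *iputil.Subnet is a typed *net.IPNet, so it can be converted back for printing.
		dlog.Debugf(ctx, "subnet %s", (*net.IPNet)(sn))
	}
}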
   479  
   480  func CheckTrafficManagerService(ctx context.Context, namespace string) error {
   481  	dlog.Debug(ctx, "checking that traffic-manager exists")
   482  	coreV1 := k8sapi.GetK8sInterface(ctx).CoreV1()
   483  	if _, err := coreV1.Services(namespace).Get(ctx, "traffic-manager", meta.GetOptions{}); err != nil {
   484  		msg := fmt.Sprintf("unable to get service traffic-manager in %s: %v", namespace, err)
   485  		se := &k8serrors.StatusError{}
   486  		if errors.As(err, &se) {
   487  			if se.Status().Code == http.StatusNotFound {
   488  				msg = "traffic manager not found. If it is not installed, please run 'telepresence helm install'. " +
   489  					"If it is installed, try connecting with --manager-namespace to point telepresence to the namespace it's installed in."
   490  			}
   491  		}
   492  		return errcat.User.New(msg)
   493  	}
   494  	return nil
   495  }
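
// Editor's note: the sketch below is not part of the original source. It shows that
// CheckTrafficManagerService reports a missing traffic-manager as an errcat.User error,
// which the connector surfaces directly to the user; the namespace value is illustrative.
func exampleCheckTrafficManager(ctx context.Context) error {
	if err := CheckTrafficManagerService(ctx, "ambassador"); err != nil {
		dlog.Errorf(ctx, "traffic-manager check failed (category %d): %v", errcat.GetCategory(err), err)
		return err
	}
	return nil
}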
   496  
   497  func connectError(t rpc.ConnectInfo_ErrType, err error) *rpc.ConnectInfo {
   498  	st := status.Convert(err)
   499  	for _, detail := range st.Details() {
   500  		if detail, ok := detail.(*common.Result); ok {
   501  			return &rpc.ConnectInfo{
   502  				Error:         t,
   503  				ErrorText:     string(detail.Data),
   504  				ErrorCategory: int32(detail.ErrorCategory),
   505  			}
   506  		}
   507  	}
   508  	return &rpc.ConnectInfo{
   509  		Error:         t,
   510  		ErrorText:     err.Error(),
   511  		ErrorCategory: int32(errcat.GetCategory(err)),
   512  	}
   513  }
   514  
   515  // updateDaemonNamespaces will create a new DNS search path from the given namespaces and
   516  // send it to the DNS-resolver in the daemon.
   517  func (s *session) updateDaemonNamespaces(c context.Context) {
   518  	const svcDomain = "svc"
   519  
   520  	s.wlWatcher.setNamespacesToWatch(c, s.GetCurrentNamespaces(true))
   521  
   522  	domains := s.GetCurrentNamespaces(false)
   523  	if !slices.Contains(domains, svcDomain) {
   524  		domains = append(domains, svcDomain)
   525  	}
   526  	dlog.Debugf(c, "posting top-level domains %v to root daemon", domains)
   527  
   528  	if _, err := s.rootDaemon.SetDNSTopLevelDomains(c, &rootdRpc.Domains{Domains: domains}); err != nil {
   529  		dlog.Errorf(c, "error posting domains %v to root daemon: %v", domains, err)
   530  	}
   531  	dlog.Debug(c, "domains posted successfully")
   532  }
   533  
   534  func (s *session) Epilog(ctx context.Context) {
   535  	_, _ = s.rootDaemon.Disconnect(ctx, &empty.Empty{})
   536  	_ = s.pfDialer.Close()
   537  	dlog.Info(ctx, "-- Session ended")
   538  	close(s.done)
   539  }
   540  
   541  func (s *session) StartServices(g *dgroup.Group) {
   542  	g.Go("remain", s.remainLoop)
   543  	g.Go("intercept-port-forward", s.watchInterceptsHandler)
   544  	g.Go("dial-request-watcher", s.dialRequestWatcher)
   545  }
   546  
   547  func runWithRetry(ctx context.Context, f func(context.Context) error) error {
   548  	backoff := 100 * time.Millisecond
   549  	for ctx.Err() == nil {
   550  		if err := f(ctx); err != nil {
   551  			dlog.Error(ctx, err)
   552  			dtime.SleepWithContext(ctx, backoff)
   553  			backoff *= 2
   554  			if backoff > 3*time.Second {
   555  				backoff = 3 * time.Second
   556  			}
   557  		}
   558  	}
   559  	return nil
   560  }
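
// Editor's note: the sketch below is not part of the original source. It shows how a
// long-running watcher could be wrapped in runWithRetry so that transient failures are
// retried with the capped exponential backoff implemented above. The goroutine name
// "example-watcher" and the inline watcher body are hypothetical.
func (s *session) exampleStartWatcherWithRetry(g *dgroup.Group) {
	g.Go("example-watcher", func(ctx context.Context) error {
		return runWithRetry(ctx, func(ctx context.Context) error {
			// A real watcher would block here on a gRPC stream until it breaks or ctx is done.
			<-ctx.Done()
			return nil
		})
	})
}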
   561  
   562  func (s *session) Done() <-chan struct{} {
   563  	return s.done
   564  }
   565  
   566  func (s *session) SessionInfo() *manager.SessionInfo {
   567  	return s.sessionInfo
   568  }
   569  
   570  func (s *session) ApplyConfig(ctx context.Context) error {
   571  	cfg, err := client.LoadConfig(ctx)
   572  	if err != nil {
   573  		return err
   574  	}
   575  	err = client.MergeAndReplace(ctx, s.sessionConfig, cfg, false)
   576  	if err != nil {
   577  		return err
   578  	}
   579  	if len(s.MappedNamespaces) == 0 {
   580  		mns := client.GetConfig(ctx).Cluster().MappedNamespaces
   581  		if len(mns) > 0 {
   582  			s.SetMappedNamespaces(ctx, mns)
   583  		}
   584  	}
   585  	return err
   586  }
   587  
   588  // getInfosForWorkloads returns a list of workloads found in the given namespaces that fulfill the given filter criteria.
   589  func (s *session) getInfosForWorkloads(
   590  	ctx context.Context,
   591  	namespaces []string,
   592  	iMap map[string][]*manager.InterceptInfo,
   593  	sMap map[string]*rpc.WorkloadInfo_Sidecar,
   594  	filter rpc.ListRequest_Filter,
   595  ) []*rpc.WorkloadInfo {
   596  	wiMap := make(map[types.UID]*rpc.WorkloadInfo)
   597  	s.wlWatcher.eachService(ctx, s.GetManagerNamespace(), namespaces, func(svc *core.Service) {
   598  		wls, err := s.wlWatcher.findMatchingWorkloads(ctx, svc)
   599  		if err != nil {
   600  			return
   601  		}
   602  		for _, workload := range wls {
   603  			serviceUID := string(svc.UID)
   604  
   605  			if wlInfo, ok := wiMap[workload.GetUID()]; ok {
   606  				if _, ok := wlInfo.Services[serviceUID]; !ok {
   607  					wlInfo.Services[serviceUID] = &rpc.WorkloadInfo_ServiceReference{
   608  						Name:      svc.Name,
   609  						Namespace: svc.Namespace,
   610  						Ports:     getServicePorts(svc),
   611  					}
   612  				}
   613  				continue
   614  			}
   615  
   616  			name := workload.GetName()
   617  			dlog.Debugf(ctx, "Getting info for %s %s.%s, matching service %s.%s", workload.GetKind(), name, workload.GetNamespace(), svc.Name, svc.Namespace)
   618  
   619  			wlInfo := &rpc.WorkloadInfo{
   620  				Name:                 name,
   621  				Namespace:            workload.GetNamespace(),
   622  				WorkloadResourceType: workload.GetKind(),
   623  				Uid:                  string(workload.GetUID()),
   624  				Services: map[string]*rpc.WorkloadInfo_ServiceReference{
   625  					string(svc.UID): {
   626  						Name:      svc.Name,
   627  						Namespace: svc.Namespace,
   628  						Ports:     getServicePorts(svc),
   629  					},
   630  				},
   631  			}
   632  			var ok bool
   633  			if wlInfo.InterceptInfos, ok = iMap[name]; !ok && filter <= rpc.ListRequest_INTERCEPTS {
   634  				continue
   635  			}
   636  			if wlInfo.Sidecar, ok = sMap[name]; !ok && filter <= rpc.ListRequest_INSTALLED_AGENTS {
   637  				continue
   638  			}
   639  			wiMap[workload.GetUID()] = wlInfo
   640  		}
   641  	})
   642  	wiz := make([]*rpc.WorkloadInfo, len(wiMap))
   643  	i := 0
   644  	for _, wi := range wiMap {
   645  		wiz[i] = wi
   646  		i++
   647  	}
   648  	sort.Slice(wiz, func(i, j int) bool { return wiz[i].Name < wiz[j].Name })
   649  	return wiz
   650  }
   651  
   652  func getServicePorts(svc *core.Service) []*rpc.WorkloadInfo_ServiceReference_Port {
   653  	ports := make([]*rpc.WorkloadInfo_ServiceReference_Port, len(svc.Spec.Ports))
   654  	for i, p := range svc.Spec.Ports {
   655  		ports[i] = &rpc.WorkloadInfo_ServiceReference_Port{
   656  			Name: p.Name,
   657  			Port: p.Port,
   658  		}
   659  	}
   660  	return ports
   661  }
   662  
   663  func (s *session) waitForSync(ctx context.Context) {
   664  	s.wlWatcher.setNamespacesToWatch(ctx, s.GetCurrentNamespaces(true))
   665  	s.wlWatcher.waitForSync(ctx)
   666  }
   667  
   668  func (s *session) WatchWorkloads(c context.Context, wr *rpc.WatchWorkloadsRequest, stream userd.WatchWorkloadsStream) error {
   669  	s.waitForSync(c)
   670  	s.ensureWatchers(c, wr.Namespaces)
   671  	sCtx, sCancel := context.WithCancel(c)
   672  	// We need to make sure the subscription ends when we leave this method, since this is the one consuming the snapshotAvailable channel.
   673  	// Otherwise, the goroutine that writes to the channel will leak.
   674  	defer sCancel()
   675  	snapshotAvailable := s.wlWatcher.subscribe(sCtx)
   676  	for {
   677  		select {
   678  		case <-c.Done(): // if context is done (usually the session's context).
   679  			return nil
   680  		case <-stream.Context().Done(): // if stream context is done.
   681  			return nil
   682  		case <-snapshotAvailable:
   683  			snapshot, err := s.workloadInfoSnapshot(c, wr.GetNamespaces(), rpc.ListRequest_INTERCEPTABLE)
   684  			if err != nil {
   685  				return status.Errorf(codes.Unavailable, "failed to create WorkloadInfoSnapshot: %v", err)
   686  			}
   687  			if err := stream.Send(snapshot); err != nil {
   688  				dlog.Errorf(c, "WatchWorkloads.Send() failed: %v", err)
   689  				return err
   690  			}
   691  		}
   692  	}
   693  }
   694  
   695  func (s *session) WorkloadInfoSnapshot(
   696  	ctx context.Context,
   697  	namespaces []string,
   698  	filter rpc.ListRequest_Filter,
   699  ) (*rpc.WorkloadInfoSnapshot, error) {
   700  	s.waitForSync(ctx)
   701  	return s.workloadInfoSnapshot(ctx, namespaces, filter)
   702  }
   703  
   704  func (s *session) ensureWatchers(ctx context.Context,
   705  	namespaces []string,
   706  ) {
   707  	dlog.Debugf(ctx, "Ensure watchers %v", namespaces)
   708  	wg := sync.WaitGroup{}
   709  	wg.Add(len(namespaces))
   710  	for _, ns := range namespaces {
   711  		if ns == "" {
   712  			ns = s.Namespace
   713  		}
   714  		wgp := &wg
   715  		s.wlWatcher.ensureStarted(ctx, ns, func(started bool) {
   716  			if started {
   717  				dlog.Debugf(ctx, "watchers for %s started", ns)
   718  			}
   719  			if wgp != nil {
   720  				wgp.Done()
   721  				wgp = nil
   722  			}
   723  		})
   724  	}
   725  	wg.Wait()
   726  }
   727  
   728  func (s *session) workloadInfoSnapshot(
   729  	ctx context.Context,
   730  	namespaces []string,
   731  	filter rpc.ListRequest_Filter,
   732  ) (*rpc.WorkloadInfoSnapshot, error) {
   733  	is := s.getCurrentIntercepts()
   734  	s.ensureWatchers(ctx, namespaces)
   735  
   736  	var nss []string
   737  	if filter == rpc.ListRequest_INTERCEPTS {
   738  		// Special case: we don't care about namespaces in general. Instead, we use the connected namespace.
   739  		nss = []string{s.Namespace}
   740  	} else {
   741  		nss = make([]string, 0, len(namespaces))
   742  		for _, ns := range namespaces {
   743  			ns = s.ActualNamespace(ns)
   744  			if ns != "" {
   745  				nss = append(nss, ns)
   746  			}
   747  		}
   748  	}
   749  	if len(nss) == 0 {
   750  		// none of the namespaces are currently mapped
   751  		return &rpc.WorkloadInfoSnapshot{}, nil
   752  	}
   753  
   754  	iMap := make(map[string][]*manager.InterceptInfo, len(is))
   755  nextIs:
   756  	for _, i := range is {
   757  		for _, ns := range nss {
   758  			if i.Spec.Namespace == ns {
   759  				iMap[i.Spec.Agent] = append(iMap[i.Spec.Agent], i.InterceptInfo)
   760  				continue nextIs
   761  			}
   762  		}
   763  	}
   764  
   765  	sMap := make(map[string]*rpc.WorkloadInfo_Sidecar)
   766  	for _, ns := range nss {
   767  		for k, v := range s.getCurrentSidecarsInNamespace(ctx, ns) {
   768  			data, err := json.Marshal(v)
   769  			if err != nil {
   770  				continue
   771  			}
   772  			sMap[k] = &rpc.WorkloadInfo_Sidecar{Json: data}
   773  		}
   774  	}
   775  
   776  	workloadInfos := s.getInfosForWorkloads(ctx, nss, iMap, sMap, filter)
   777  	return &rpc.WorkloadInfoSnapshot{Workloads: workloadInfos}, nil
   778  }
   779  
   780  var ErrSessionExpired = errors.New("session expired")
   781  
   782  func (s *session) remainLoop(c context.Context) error {
   783  	ticker := time.NewTicker(5 * time.Second)
   784  	defer func() {
   785  		ticker.Stop()
   786  		c = dcontext.WithoutCancel(c)
   787  		c, cancel := context.WithTimeout(c, 3*time.Second)
   788  		defer cancel()
   789  		if _, err := s.managerClient.Depart(c, s.SessionInfo()); err != nil {
   790  			dlog.Errorf(c, "failed to depart from manager: %v", err)
   791  		} else {
   792  			// Depart succeeded so the traffic-manager has dropped the session. We should too
   793  			if err = DeleteSessionInfoFromUserCache(c, s.daemonID); err != nil {
   794  				dlog.Errorf(c, "failed to delete session from user cache: %v", err)
   795  			}
   796  		}
   797  		s.managerConn.Close()
   798  	}()
   799  
   800  	for {
   801  		select {
   802  		case <-c.Done():
   803  			return nil
   804  		case <-ticker.C:
   805  			if err := s.Remain(c); err != nil {
   806  				return err
   807  			}
   808  		}
   809  	}
   810  }
   811  
   812  func (s *session) UpdateStatus(c context.Context, cr *rpc.ConnectRequest) *rpc.ConnectInfo {
   813  	config, err := client.DaemonKubeconfig(c, cr)
   814  	if err != nil {
   815  		return connectError(rpc.ConnectInfo_CLUSTER_FAILED, err)
   816  	}
   817  
   818  	if !cr.IsPodDaemon {
   819  		envEQ := true
   820  		for k, v := range cr.Environment {
   821  			if k[0] == '-' {
   822  				if _, ok := os.LookupEnv(k[1:]); ok {
   823  					envEQ = false
   824  					break
   825  				}
   826  			} else {
   827  				if ov, ok := os.LookupEnv(k); !ok || ov != v {
   828  					envEQ = false
   829  					break
   830  				}
   831  			}
   832  		}
   833  		if !(envEQ && s.Kubeconfig.ContextServiceAndFlagsEqual(config)) {
   834  			return &rpc.ConnectInfo{
   835  				Error:            rpc.ConnectInfo_MUST_RESTART,
   836  				ClusterContext:   s.Kubeconfig.Context,
   837  				ClusterServer:    s.Kubeconfig.Server,
   838  				ClusterId:        s.GetClusterId(c),
   839  				ManagerInstallId: s.GetManagerInstallId(c),
   840  			}
   841  		}
   842  	}
   843  
   844  	namespaces := cr.MappedNamespaces
   845  	if len(namespaces) == 1 && namespaces[0] == "all" {
   846  		namespaces = nil
   847  	}
   848  	if len(namespaces) == 0 {
   849  		namespaces = client.GetConfig(c).Cluster().MappedNamespaces
   850  	}
   851  
   852  	if s.SetMappedNamespaces(c, namespaces) {
   853  		if len(namespaces) == 0 && k8sclient.CanWatchNamespaces(c) {
   854  			s.StartNamespaceWatcher(c)
   855  		}
   856  		s.currentInterceptsLock.Lock()
   857  		s.ingressInfo = nil
   858  		s.currentInterceptsLock.Unlock()
   859  	}
   860  	s.subnetViaWorkloads = cr.SubnetViaWorkloads
   861  	return s.Status(c)
   862  }
   863  
   864  func (s *session) Status(c context.Context) *rpc.ConnectInfo {
   865  	return s.status(c, false)
   866  }
   867  
   868  func (s *session) status(c context.Context, initial bool) *rpc.ConnectInfo {
   869  	cfg := s.Kubeconfig
   870  	ret := &rpc.ConnectInfo{
   871  		ClusterContext:     cfg.Context,
   872  		ClusterServer:      cfg.Server,
   873  		ClusterId:          s.GetClusterId(c),
   874  		ManagerInstallId:   s.GetManagerInstallId(c),
   875  		SessionInfo:        s.SessionInfo(),
   876  		ConnectionName:     s.daemonID.Name,
   877  		KubeFlags:          s.OriginalFlagMap,
   878  		Namespace:          s.Namespace,
   879  		Intercepts:         &manager.InterceptInfoSnapshot{Intercepts: s.getCurrentInterceptInfos()},
   880  		ManagerNamespace:   cfg.GetManagerNamespace(),
   881  		SubnetViaWorkloads: s.subnetViaWorkloads,
   882  		Version: &common.VersionInfo{
   883  			ApiVersion: client.APIVersion,
   884  			Version:    client.Version(),
   885  			Executable: client.GetExe(c),
   886  			Name:       client.DisplayName,
   887  		},
   888  	}
   889  	if !initial {
   890  		ret.Error = rpc.ConnectInfo_ALREADY_CONNECTED
   891  	}
   892  	if len(s.MappedNamespaces) > 0 || len(s.sessionConfig.Cluster().MappedNamespaces) > 0 {
   893  		ret.MappedNamespaces = s.GetCurrentNamespaces(true)
   894  	}
   895  	var err error
   896  	ret.DaemonStatus, err = s.rootDaemon.Status(c, &empty.Empty{})
   897  	if err != nil {
   898  		return connectError(rpc.ConnectInfo_DAEMON_FAILED, err)
   899  	}
   900  	return ret
   901  }
   902  
   903  // Uninstall one or all traffic-agents from the cluster if the client has sufficient credentials to do so.
   904  //
   905  // Uninstalling all or specific agents requires that the client can get and update the agents ConfigMap.
   906  func (s *session) Uninstall(ctx context.Context, ur *rpc.UninstallRequest) (*common.Result, error) {
   907  	api := k8sapi.GetK8sInterface(ctx).CoreV1()
   908  	loadAgentConfigMap := func(ns string) (*core.ConfigMap, error) {
   909  		cm, err := api.ConfigMaps(ns).Get(ctx, agentconfig.ConfigMap, meta.GetOptions{})
   910  		if err != nil {
   911  			if k8serrors.IsNotFound(err) {
   912  				// there are no agents to remove
   913  				return nil, nil
   914  			}
   915  			// TODO: find out if this is due to lack of access credentials and if so, report using errcat.User with more meaningful message
   916  			return nil, err
   917  		}
   918  		return cm, nil
   919  	}
   920  
   921  	updateAgentConfigMap := func(ns string, cm *core.ConfigMap) error {
   922  		_, err := api.ConfigMaps(ns).Update(ctx, cm, meta.UpdateOptions{})
   923  		return err
   924  	}
   925  
   926  	// Removal of agents requested. We need the agents ConfigMap in order to do that.
   927  	// This removal is deliberately done in the client instead of the traffic-manager so that RBAC can be configured
   928  	// to prevent the clients from doing it.
   929  	if ur.UninstallType == rpc.UninstallRequest_NAMED_AGENTS {
   930  		// must have a valid namespace in order to uninstall named agents
   931  		s.waitForSync(ctx)
   932  		if ur.Namespace == "" {
   933  			ur.Namespace = s.Namespace
   934  		}
   935  		s.wlWatcher.ensureStarted(ctx, ur.Namespace, nil)
   936  		namespace := s.ActualNamespace(ur.Namespace)
   937  		if namespace == "" {
   938  			// namespace is not mapped
   939  			return errcat.ToResult(errcat.User.Newf("namespace %s is not mapped", ur.Namespace)), nil
   940  		}
   941  		cm, err := loadAgentConfigMap(namespace)
   942  		if err != nil || cm == nil {
   943  			return errcat.ToResult(err), nil
   944  		}
   945  		changed := false
   946  		ics := s.getCurrentIntercepts()
   947  		for _, an := range ur.Agents {
   948  			for _, ic := range ics {
   949  				if ic.Spec.Namespace == namespace && ic.Spec.Agent == an {
   950  					_ = s.removeIntercept(ctx, ic)
   951  					break
   952  				}
   953  			}
   954  			if _, ok := cm.Data[an]; ok {
   955  				delete(cm.Data, an)
   956  				changed = true
   957  			}
   958  		}
   959  		if changed {
   960  			return errcat.ToResult(updateAgentConfigMap(namespace, cm)), nil
   961  		}
   962  		return errcat.ToResult(nil), nil
   963  	}
   964  	if ur.UninstallType != rpc.UninstallRequest_ALL_AGENTS {
   965  		return nil, status.Error(codes.InvalidArgument, "invalid uninstall request")
   966  	}
   967  
   968  	_ = s.ClearIntercepts(ctx)
   969  	clearAgentsConfigMap := func(ns string) error {
   970  		cm, err := loadAgentConfigMap(ns)
   971  		if err != nil {
   972  			return err
   973  		}
   974  		if cm == nil {
   975  			return nil
   976  		}
   977  		if len(cm.Data) > 0 {
   978  			cm.Data = nil
   979  			return updateAgentConfigMap(ns, cm)
   980  		}
   981  		return nil
   982  	}
   983  
   984  	if ur.Namespace != "" {
   985  		s.waitForSync(ctx)
   986  		if ur.Namespace == "" {
   987  			ur.Namespace = s.Namespace
   988  		}
   989  		s.wlWatcher.ensureStarted(ctx, ur.Namespace, nil)
   990  		namespace := s.ActualNamespace(ur.Namespace)
   991  		if namespace == "" {
   992  			// namespace is not mapped
   993  			return errcat.ToResult(errcat.User.Newf("namespace %s is not mapped", ur.Namespace)), nil
   994  		}
   995  		return errcat.ToResult(clearAgentsConfigMap(namespace)), nil
   996  	} else {
   997  		// Load all affected ConfigMaps.
   998  		for _, ns := range s.GetCurrentNamespaces(true) {
   999  			err := clearAgentsConfigMap(ns)
  1000  			if err != nil {
  1001  				return errcat.ToResult(err), nil
  1002  			}
  1003  		}
  1004  	}
  1005  	return errcat.ToResult(nil), nil
  1006  }
  1007  
  1008  func (s *session) getOutboundInfo(ctx context.Context, cr *rpc.ConnectRequest) *rootdRpc.OutboundInfo {
  1009  	// We'll figure out the IP address of the API server(s) so that we can tell the daemon never to proxy them.
  1010  	// This is because in some setups the API server will be in the same CIDR range as the pods, and the
  1011  	// daemon will attempt to proxy traffic to it. This usually results in a loss of all traffic to/from
  1012  	// the cluster, since an open tunnel to the traffic-manager (via the API server) is itself required
  1013  	// to communicate with the cluster.
  1014  	neverProxy := make([]*manager.IPNet, 0, 1+len(s.NeverProxy))
  1015  	serverURL, err := url.Parse(s.Server)
  1016  	if err != nil {
  1017  		// This really shouldn't happen as we are connected to the server
  1018  		dlog.Errorf(ctx, "Unable to parse url for k8s server %s: %v", s.Server, err)
  1019  	} else {
  1020  		hostname := serverURL.Hostname()
  1021  		rawIP := iputil.Parse(hostname)
  1022  		ips := []net.IP{rawIP}
  1023  		if rawIP == nil {
  1024  			var err error
  1025  			ips, err = net.LookupIP(hostname)
  1026  			if err != nil {
  1027  				dlog.Errorf(ctx, "Unable to do DNS lookup for k8s server %s: %v", hostname, err)
  1028  				ips = []net.IP{}
  1029  			}
  1030  		}
  1031  		for _, ip := range ips {
  1032  			mask := net.CIDRMask(128, 128)
  1033  			if ipv4 := ip.To4(); ipv4 != nil {
  1034  				mask = net.CIDRMask(32, 32)
  1035  				ip = ipv4
  1036  			}
  1037  			if !ip.IsLoopback() {
  1038  				ipnet := &net.IPNet{IP: ip, Mask: mask}
  1039  				neverProxy = append(neverProxy, iputil.IPNetToRPC(ipnet))
  1040  			}
  1041  		}
  1042  	}
  1043  	for _, np := range s.NeverProxy {
  1044  		neverProxy = append(neverProxy, iputil.IPNetToRPC((*net.IPNet)(np)))
  1045  	}
  1046  	info := &rootdRpc.OutboundInfo{
  1047  		Session:            s.sessionInfo,
  1048  		NeverProxySubnets:  neverProxy,
  1049  		HomeDir:            homedir.HomeDir(),
  1050  		Namespace:          s.Namespace,
  1051  		ManagerNamespace:   s.GetManagerNamespace(),
  1052  		SubnetViaWorkloads: s.subnetViaWorkloads,
  1053  		KubeFlags:          cr.KubeFlags,
  1054  		KubeconfigData:     cr.KubeconfigData,
  1055  	}
  1056  
  1057  	if s.DNS != nil {
  1058  		info.Dns = &rootdRpc.DNSConfig{
  1059  			ExcludeSuffixes: s.DNS.ExcludeSuffixes,
  1060  			IncludeSuffixes: s.DNS.IncludeSuffixes,
  1061  			Excludes:        s.DNS.Excludes,
  1062  			Mappings:        s.DNS.Mappings.ToRPC(),
  1063  			LookupTimeout:   durationpb.New(s.DNS.LookupTimeout.Duration),
  1064  		}
  1065  		if len(s.DNS.LocalIP) > 0 {
  1066  			info.Dns.LocalIp = s.DNS.LocalIP.IP()
  1067  		}
  1068  		if len(s.DNS.RemoteIP) > 0 {
  1069  			info.Dns.RemoteIp = s.DNS.RemoteIP.IP()
  1070  		}
  1071  	}
  1072  
  1073  	if len(s.AlsoProxy) > 0 {
  1074  		info.AlsoProxySubnets = make([]*manager.IPNet, len(s.AlsoProxy))
  1075  		for i, ap := range s.AlsoProxy {
  1076  			info.AlsoProxySubnets[i] = iputil.IPNetToRPC((*net.IPNet)(ap))
  1077  		}
  1078  	}
  1079  	if len(s.AllowConflictingSubnets) > 0 {
  1080  		info.AllowConflictingSubnets = make([]*manager.IPNet, len(s.AllowConflictingSubnets))
  1081  		for i, ap := range s.AllowConflictingSubnets {
  1082  			info.AllowConflictingSubnets[i] = iputil.IPNetToRPC((*net.IPNet)(ap))
  1083  		}
  1084  	}
  1085  	return info
  1086  }
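
// Editor's note: the sketch below is not part of the original source. It isolates the
// single-host never-proxy construction used above for the API server addresses: a /32
// (or /128 for IPv6) subnet is built around the host IP and converted to its RPC form.
// The helper name exampleHostNeverProxy is hypothetical.
func exampleHostNeverProxy(hostIP net.IP) *manager.IPNet {
	mask := net.CIDRMask(128, 128)
	if ip4 := hostIP.To4(); ip4 != nil {
		mask = net.CIDRMask(32, 32)
		hostIP = ip4
	}
	return iputil.IPNetToRPC(&net.IPNet{IP: hostIP, Mask: mask})
}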
  1087  
  1088  func (s *session) connectRootDaemon(ctx context.Context, oi *rootdRpc.OutboundInfo, isPodDaemon bool) (rd rootdRpc.DaemonClient, err error) {
  1089  	// establish a connection to the root daemon's gRPC service
  1090  	dlog.Info(ctx, "Connecting to root daemon...")
  1091  	svc := userd.GetService(ctx)
  1092  	if svc.RootSessionInProcess() {
  1093  		// Just run the root session in-process.
  1094  		rootSession, err := rootd.NewInProcSession(ctx, oi, s.managerClient, s.managerVersion, isPodDaemon)
  1095  		if err != nil {
  1096  			return nil, err
  1097  		}
  1098  		if err = rootSession.Start(ctx, dgroup.NewGroup(ctx, dgroup.GroupConfig{})); err != nil {
  1099  			return nil, err
  1100  		}
  1101  		rd = rootSession
  1102  	} else {
  1103  		var conn *grpc.ClientConn
  1104  		conn, err = socket.Dial(ctx, socket.RootDaemonPath(ctx),
  1105  			grpc.WithStatsHandler(otelgrpc.NewClientHandler()),
  1106  		)
  1107  		if err != nil {
  1108  			return nil, fmt.Errorf("unable to open root daemon socket: %w", err)
  1109  		}
  1110  		defer func() {
  1111  			if err != nil {
  1112  				conn.Close()
  1113  			}
  1114  		}()
  1115  		rd = rootdRpc.NewDaemonClient(conn)
  1116  
  1117  		for attempt := 1; ; attempt++ {
  1118  			var rootStatus *rootdRpc.DaemonStatus
  1119  			if rootStatus, err = rd.Connect(ctx, oi); err != nil {
  1120  				return nil, fmt.Errorf("failed to connect to root daemon: %w", err)
  1121  			}
  1122  			oc := rootStatus.OutboundConfig
  1123  			if oc == nil || oc.Session == nil {
  1124  				// This is an internal error. Something is wrong with the root daemon.
  1125  				return nil, errors.New("root daemon's OutboundConfig has no Session")
  1126  			}
  1127  			if oc.Session.SessionId == oi.Session.SessionId {
  1128  				break
  1129  			}
  1130  
  1131  			// Root daemon was running an old session. This indicates that this daemon somehow
  1132  			// crashed without disconnecting. So let's do that now, and then reconnect...
  1133  			if attempt == 2 {
  1134  				// ...or not, since we've already done it.
  1135  				return nil, errors.New("unable to reconnect to root daemon")
  1136  			}
  1137  			if _, err = rd.Disconnect(ctx, &empty.Empty{}); err != nil {
  1138  				return nil, fmt.Errorf("failed to disconnect from the root daemon: %w", err)
  1139  			}
  1140  		}
  1141  	}
  1142  
  1143  	// The root daemon needs time to set up the TUN-device and DNS, which involves interacting
  1144  	// with the cluster-side traffic-manager. We know that the traffic-manager is up and
  1145  	// responding at this point, so it shouldn't take too long.
  1146  	ctx, cancel := client.GetConfig(ctx).Timeouts().TimeoutContext(ctx, client.TimeoutTrafficManagerAPI)
  1147  	defer cancel()
  1148  	if _, err = rd.WaitForNetwork(ctx, &empty.Empty{}); err != nil {
  1149  		if se, ok := status.FromError(err); ok {
  1150  			err = se.Err()
  1151  		}
  1152  		return nil, fmt.Errorf("failed to connect to root daemon: %v", err)
  1153  	}
  1154  	dlog.Debug(ctx, "Connected to root daemon")
  1155  	return rd, nil
  1156  }