github.com/telepresenceio/telepresence/v2@v2.20.0-pro.6.0.20240517030216-236ea954e789/pkg/client/userd/trafficmgr/tracing.go (about)

     1  package trafficmgr
     2  
     3  import (
     4  	"compress/gzip"
     5  	"context"
     6  	"fmt"
     7  	"net"
     8  	"os"
     9  	"path/filepath"
    10  	"strconv"
    11  	"sync"
    12  	"time"
    13  
    14  	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
    15  	"go.opentelemetry.io/otel"
    16  	"go.opentelemetry.io/otel/attribute"
    17  	"go.opentelemetry.io/otel/codes"
    18  	"go.opentelemetry.io/otel/trace"
    19  	"google.golang.org/grpc"
    20  	"google.golang.org/grpc/credentials/insecure"
    21  	"google.golang.org/protobuf/types/known/emptypb"
    22  	core "k8s.io/api/core/v1"
    23  	typed "k8s.io/client-go/kubernetes/typed/core/v1"
    24  
    25  	"github.com/datawire/dlib/dlog"
    26  	"github.com/telepresenceio/telepresence/rpc/v2/common"
    27  	"github.com/telepresenceio/telepresence/rpc/v2/connector"
    28  	"github.com/telepresenceio/telepresence/v2/pkg/client"
    29  	"github.com/telepresenceio/telepresence/v2/pkg/client/socket"
    30  	"github.com/telepresenceio/telepresence/v2/pkg/errcat"
    31  )
    32  
    33  type traceCollector struct {
    34  	*connector.TracesRequest
    35  }
    36  
    37  func (*traceCollector) tracesFor(ctx context.Context, conn *grpc.ClientConn, ch chan<- []byte, component string) error {
    38  	ctx, span := otel.GetTracerProvider().Tracer("").Start(ctx, "tracesFor", trace.WithAttributes(attribute.String("component", component)))
    39  	defer span.End()
    40  	cli := common.NewTracingClient(conn)
    41  	cfg := client.GetConfig(ctx)
    42  	maxRecSize := int64(1024 * 1024 * 20) // Default to 20 Mb here. There might be a lot of traces.
    43  	if mz := cfg.Grpc().MaxReceiveSize(); mz > maxRecSize {
    44  		maxRecSize = mz
    45  	}
    46  	result, err := cli.DumpTraces(ctx, &emptypb.Empty{}, grpc.MaxCallRecvMsgSize(int(maxRecSize)))
    47  	if err != nil {
    48  		span.RecordError(err)
    49  		span.SetStatus(codes.Error, err.Error())
    50  		return err
    51  	}
    52  	data := result.GetTraceData()
    53  	select {
    54  	case ch <- data:
    55  	case <-ctx.Done():
    56  	}
    57  	return nil
    58  }
    59  
    60  func (*traceCollector) launchTraceWriter(ctx context.Context, destFile string) (chan<- []byte, <-chan error, error) {
    61  	ch := make(chan []byte)
    62  	var err error
    63  	if destFile, err = filepath.Abs(destFile); err != nil {
    64  		return nil, nil, err
    65  	}
    66  	file, err := os.Create(destFile)
    67  	if err != nil {
    68  		return nil, nil, fmt.Errorf("failed to create trace file: %w", err)
    69  	}
    70  	errCh := make(chan error)
    71  
    72  	go func() {
    73  		zipW := gzip.NewWriter(file)
    74  		defer func() {
    75  			err = zipW.Close()
    76  			if err != nil {
    77  				errCh <- err
    78  				return
    79  			}
    80  			err = file.Close()
    81  			if err != nil {
    82  				errCh <- err
    83  				return
    84  			}
    85  			close(errCh)
    86  		}()
    87  
    88  		for {
    89  			select {
    90  			case <-ctx.Done():
    91  				return
    92  			case data, ok := <-ch:
    93  				if !ok {
    94  					return
    95  				}
    96  				_, err := zipW.Write(data)
    97  				if err != nil {
    98  					errCh <- err
    99  					return
   100  				}
   101  			}
   102  		}
   103  	}()
   104  	return ch, errCh, nil
   105  }
   106  
   107  func (c *traceCollector) userdTraces(ctx context.Context, tCh chan<- []byte) error {
   108  	userdConn, err := socket.Dial(ctx, socket.UserDaemonPath(ctx), grpc.WithStatsHandler(otelgrpc.NewClientHandler()))
   109  	if err != nil {
   110  		return err
   111  	}
   112  	defer userdConn.Close()
   113  
   114  	return c.tracesFor(ctx, userdConn, tCh, "user-daemon")
   115  }
   116  
   117  func (c *traceCollector) rootdTraces(ctx context.Context, tCh chan<- []byte) error {
   118  	dConn, err := socket.Dial(ctx, socket.RootDaemonPath(ctx), grpc.WithStatsHandler(otelgrpc.NewClientHandler()))
   119  	if err != nil {
   120  		return err
   121  	}
   122  	defer dConn.Close()
   123  
   124  	return c.tracesFor(ctx, dConn, tCh, "root-daemon")
   125  }
   126  
   127  func (c *traceCollector) trafficManagerTraces(ctx context.Context, sess *session, tCh chan<- []byte, remotePort string) error {
   128  	span := trace.SpanFromContext(ctx)
   129  	host := "svc/traffic-manager." + sess.GetManagerNamespace()
   130  	grpcAddr := net.JoinHostPort(host, remotePort)
   131  	span.SetAttributes(attribute.String("traffic-manager.host", host), attribute.String("traffic-manager.port", remotePort))
   132  	tc, tCancel := context.WithTimeout(ctx, 20*time.Second)
   133  	defer tCancel()
   134  
   135  	opts := []grpc.DialOption{
   136  		grpc.WithContextDialer(sess.pfDialer.Dial),
   137  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   138  		grpc.WithNoProxy(),
   139  		grpc.WithBlock(),
   140  		grpc.WithReturnConnectionError(),
   141  		grpc.WithStatsHandler(otelgrpc.NewClientHandler()),
   142  	}
   143  
   144  	conn, err := grpc.DialContext(tc, grpcAddr, opts...)
   145  	if err != nil {
   146  		return err
   147  	}
   148  	return c.tracesFor(ctx, conn, tCh, "traffic-manager")
   149  }
   150  
   151  func (c *traceCollector) agentTraces(ctx context.Context, sess *session, tCh chan<- []byte, remotePort string) error {
   152  	return sess.ForeachAgentPod(ctx, func(ctx context.Context, pi typed.PodInterface, pod *core.Pod) {
   153  		span := trace.SpanFromContext(ctx)
   154  		name := fmt.Sprintf("%s.%s", pod.Name, pod.Namespace)
   155  		addr := net.JoinHostPort(name, remotePort)
   156  		tc, tCancel := context.WithTimeout(ctx, 20*time.Second)
   157  		defer tCancel()
   158  
   159  		opts := []grpc.DialOption{
   160  			grpc.WithContextDialer(sess.pfDialer.Dial),
   161  			grpc.WithTransportCredentials(insecure.NewCredentials()),
   162  			grpc.WithNoProxy(),
   163  			grpc.WithBlock(),
   164  			grpc.WithReturnConnectionError(),
   165  			grpc.WithStatsHandler(otelgrpc.NewClientHandler()),
   166  		}
   167  
   168  		conn, err := grpc.DialContext(tc, addr, opts...)
   169  		if err != nil {
   170  			err := fmt.Errorf("error getting traffic-agent traces for %s: %v", name, err)
   171  			span.RecordError(err, trace.WithAttributes(
   172  				attribute.String("host", name),
   173  				attribute.String("port", remotePort),
   174  			))
   175  			dlog.Error(ctx, err)
   176  			return
   177  		}
   178  		defer conn.Close()
   179  		err = c.tracesFor(tc, conn, tCh, "traffic-agent")
   180  		if err != nil {
   181  			err := fmt.Errorf("error getting traffic-agent traces for %s: %v", name, err)
   182  			span.RecordError(err, trace.WithAttributes(
   183  				attribute.String("traffic-agent.host", name),
   184  				attribute.String("traffic-agent.port", remotePort),
   185  			))
   186  			dlog.Error(ctx, err)
   187  			return
   188  		}
   189  	}, nil)
   190  }
   191  
   192  func (s *session) GatherTraces(ctx context.Context, tr *connector.TracesRequest) *common.Result {
   193  	return errcat.ToResult((&traceCollector{tr}).gatherTraces(ctx, s))
   194  }
   195  
   196  func (c *traceCollector) gatherTraces(ctx context.Context, sess *session) error {
   197  	// Since we want this trace to show up in the gather traces output file, we'll declare it as a root trace and end it right after awaiting the wait group
   198  	ctx, span := otel.GetTracerProvider().Tracer("").Start(ctx, "gather-traces", trace.WithNewRoot())
   199  	port := strconv.FormatUint(uint64(c.RemotePort), 10)
   200  
   201  	tCh, errCh, err := c.launchTraceWriter(ctx, c.TracingFile)
   202  	if err != nil {
   203  		return err
   204  	}
   205  
   206  	wg := &sync.WaitGroup{}
   207  	wg.Add(3)
   208  
   209  	go func() {
   210  		defer wg.Done()
   211  		err := c.rootdTraces(ctx, tCh)
   212  		if err != nil {
   213  			err := fmt.Errorf("failed to collect root daemon traces: %v", err)
   214  			span.RecordError(err)
   215  			dlog.Error(ctx, err)
   216  		}
   217  	}()
   218  
   219  	go func() {
   220  		defer wg.Done()
   221  		err = c.trafficManagerTraces(ctx, sess, tCh, port)
   222  		if err != nil {
   223  			err := fmt.Errorf("failed to collect traffic-manager traces: %v", err)
   224  			span.RecordError(err)
   225  			dlog.Error(ctx, err)
   226  		}
   227  	}()
   228  
   229  	go func() {
   230  		defer wg.Done()
   231  		err := c.agentTraces(ctx, sess, tCh, port)
   232  		if err != nil {
   233  			err := fmt.Errorf("failed to collect traffic agent traces: %v", err)
   234  			span.RecordError(err)
   235  			dlog.Error(ctx, err)
   236  		}
   237  	}()
   238  
   239  	wg.Wait()
   240  	// End span so it gets reported via userdTraces
   241  	span.End()
   242  	// These go after the other traces so that we can capture traces from the gathering of traces itself
   243  	err = c.userdTraces(ctx, tCh)
   244  	if err != nil {
   245  		// Can't imagine this makes a difference, since we've failed to collect it, but we may as well record it
   246  		err = fmt.Errorf("failed to collect user daemon traces: %v\n", err)
   247  		span.RecordError(err)
   248  		dlog.Error(ctx, err)
   249  	}
   250  
   251  	close(tCh)
   252  	err = <-errCh
   253  	if err != nil {
   254  		return err
   255  	}
   256  	return nil
   257  }