github.com/telepresenceio/telepresence/v2@v2.20.0-pro.6.0.20240517030216-236ea954e789/pkg/client/userd/trafficmgr/tracing.go (about) 1 package trafficmgr 2 3 import ( 4 "compress/gzip" 5 "context" 6 "fmt" 7 "net" 8 "os" 9 "path/filepath" 10 "strconv" 11 "sync" 12 "time" 13 14 "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" 15 "go.opentelemetry.io/otel" 16 "go.opentelemetry.io/otel/attribute" 17 "go.opentelemetry.io/otel/codes" 18 "go.opentelemetry.io/otel/trace" 19 "google.golang.org/grpc" 20 "google.golang.org/grpc/credentials/insecure" 21 "google.golang.org/protobuf/types/known/emptypb" 22 core "k8s.io/api/core/v1" 23 typed "k8s.io/client-go/kubernetes/typed/core/v1" 24 25 "github.com/datawire/dlib/dlog" 26 "github.com/telepresenceio/telepresence/rpc/v2/common" 27 "github.com/telepresenceio/telepresence/rpc/v2/connector" 28 "github.com/telepresenceio/telepresence/v2/pkg/client" 29 "github.com/telepresenceio/telepresence/v2/pkg/client/socket" 30 "github.com/telepresenceio/telepresence/v2/pkg/errcat" 31 ) 32 33 type traceCollector struct { 34 *connector.TracesRequest 35 } 36 37 func (*traceCollector) tracesFor(ctx context.Context, conn *grpc.ClientConn, ch chan<- []byte, component string) error { 38 ctx, span := otel.GetTracerProvider().Tracer("").Start(ctx, "tracesFor", trace.WithAttributes(attribute.String("component", component))) 39 defer span.End() 40 cli := common.NewTracingClient(conn) 41 cfg := client.GetConfig(ctx) 42 maxRecSize := int64(1024 * 1024 * 20) // Default to 20 Mb here. There might be a lot of traces. 43 if mz := cfg.Grpc().MaxReceiveSize(); mz > maxRecSize { 44 maxRecSize = mz 45 } 46 result, err := cli.DumpTraces(ctx, &emptypb.Empty{}, grpc.MaxCallRecvMsgSize(int(maxRecSize))) 47 if err != nil { 48 span.RecordError(err) 49 span.SetStatus(codes.Error, err.Error()) 50 return err 51 } 52 data := result.GetTraceData() 53 select { 54 case ch <- data: 55 case <-ctx.Done(): 56 } 57 return nil 58 } 59 60 func (*traceCollector) launchTraceWriter(ctx context.Context, destFile string) (chan<- []byte, <-chan error, error) { 61 ch := make(chan []byte) 62 var err error 63 if destFile, err = filepath.Abs(destFile); err != nil { 64 return nil, nil, err 65 } 66 file, err := os.Create(destFile) 67 if err != nil { 68 return nil, nil, fmt.Errorf("failed to create trace file: %w", err) 69 } 70 errCh := make(chan error) 71 72 go func() { 73 zipW := gzip.NewWriter(file) 74 defer func() { 75 err = zipW.Close() 76 if err != nil { 77 errCh <- err 78 return 79 } 80 err = file.Close() 81 if err != nil { 82 errCh <- err 83 return 84 } 85 close(errCh) 86 }() 87 88 for { 89 select { 90 case <-ctx.Done(): 91 return 92 case data, ok := <-ch: 93 if !ok { 94 return 95 } 96 _, err := zipW.Write(data) 97 if err != nil { 98 errCh <- err 99 return 100 } 101 } 102 } 103 }() 104 return ch, errCh, nil 105 } 106 107 func (c *traceCollector) userdTraces(ctx context.Context, tCh chan<- []byte) error { 108 userdConn, err := socket.Dial(ctx, socket.UserDaemonPath(ctx), grpc.WithStatsHandler(otelgrpc.NewClientHandler())) 109 if err != nil { 110 return err 111 } 112 defer userdConn.Close() 113 114 return c.tracesFor(ctx, userdConn, tCh, "user-daemon") 115 } 116 117 func (c *traceCollector) rootdTraces(ctx context.Context, tCh chan<- []byte) error { 118 dConn, err := socket.Dial(ctx, socket.RootDaemonPath(ctx), grpc.WithStatsHandler(otelgrpc.NewClientHandler())) 119 if err != nil { 120 return err 121 } 122 defer dConn.Close() 123 124 return c.tracesFor(ctx, dConn, tCh, "root-daemon") 125 } 126 127 func (c *traceCollector) trafficManagerTraces(ctx context.Context, sess *session, tCh chan<- []byte, remotePort string) error { 128 span := trace.SpanFromContext(ctx) 129 host := "svc/traffic-manager." + sess.GetManagerNamespace() 130 grpcAddr := net.JoinHostPort(host, remotePort) 131 span.SetAttributes(attribute.String("traffic-manager.host", host), attribute.String("traffic-manager.port", remotePort)) 132 tc, tCancel := context.WithTimeout(ctx, 20*time.Second) 133 defer tCancel() 134 135 opts := []grpc.DialOption{ 136 grpc.WithContextDialer(sess.pfDialer.Dial), 137 grpc.WithTransportCredentials(insecure.NewCredentials()), 138 grpc.WithNoProxy(), 139 grpc.WithBlock(), 140 grpc.WithReturnConnectionError(), 141 grpc.WithStatsHandler(otelgrpc.NewClientHandler()), 142 } 143 144 conn, err := grpc.DialContext(tc, grpcAddr, opts...) 145 if err != nil { 146 return err 147 } 148 return c.tracesFor(ctx, conn, tCh, "traffic-manager") 149 } 150 151 func (c *traceCollector) agentTraces(ctx context.Context, sess *session, tCh chan<- []byte, remotePort string) error { 152 return sess.ForeachAgentPod(ctx, func(ctx context.Context, pi typed.PodInterface, pod *core.Pod) { 153 span := trace.SpanFromContext(ctx) 154 name := fmt.Sprintf("%s.%s", pod.Name, pod.Namespace) 155 addr := net.JoinHostPort(name, remotePort) 156 tc, tCancel := context.WithTimeout(ctx, 20*time.Second) 157 defer tCancel() 158 159 opts := []grpc.DialOption{ 160 grpc.WithContextDialer(sess.pfDialer.Dial), 161 grpc.WithTransportCredentials(insecure.NewCredentials()), 162 grpc.WithNoProxy(), 163 grpc.WithBlock(), 164 grpc.WithReturnConnectionError(), 165 grpc.WithStatsHandler(otelgrpc.NewClientHandler()), 166 } 167 168 conn, err := grpc.DialContext(tc, addr, opts...) 169 if err != nil { 170 err := fmt.Errorf("error getting traffic-agent traces for %s: %v", name, err) 171 span.RecordError(err, trace.WithAttributes( 172 attribute.String("host", name), 173 attribute.String("port", remotePort), 174 )) 175 dlog.Error(ctx, err) 176 return 177 } 178 defer conn.Close() 179 err = c.tracesFor(tc, conn, tCh, "traffic-agent") 180 if err != nil { 181 err := fmt.Errorf("error getting traffic-agent traces for %s: %v", name, err) 182 span.RecordError(err, trace.WithAttributes( 183 attribute.String("traffic-agent.host", name), 184 attribute.String("traffic-agent.port", remotePort), 185 )) 186 dlog.Error(ctx, err) 187 return 188 } 189 }, nil) 190 } 191 192 func (s *session) GatherTraces(ctx context.Context, tr *connector.TracesRequest) *common.Result { 193 return errcat.ToResult((&traceCollector{tr}).gatherTraces(ctx, s)) 194 } 195 196 func (c *traceCollector) gatherTraces(ctx context.Context, sess *session) error { 197 // Since we want this trace to show up in the gather traces output file, we'll declare it as a root trace and end it right after awaiting the wait group 198 ctx, span := otel.GetTracerProvider().Tracer("").Start(ctx, "gather-traces", trace.WithNewRoot()) 199 port := strconv.FormatUint(uint64(c.RemotePort), 10) 200 201 tCh, errCh, err := c.launchTraceWriter(ctx, c.TracingFile) 202 if err != nil { 203 return err 204 } 205 206 wg := &sync.WaitGroup{} 207 wg.Add(3) 208 209 go func() { 210 defer wg.Done() 211 err := c.rootdTraces(ctx, tCh) 212 if err != nil { 213 err := fmt.Errorf("failed to collect root daemon traces: %v", err) 214 span.RecordError(err) 215 dlog.Error(ctx, err) 216 } 217 }() 218 219 go func() { 220 defer wg.Done() 221 err = c.trafficManagerTraces(ctx, sess, tCh, port) 222 if err != nil { 223 err := fmt.Errorf("failed to collect traffic-manager traces: %v", err) 224 span.RecordError(err) 225 dlog.Error(ctx, err) 226 } 227 }() 228 229 go func() { 230 defer wg.Done() 231 err := c.agentTraces(ctx, sess, tCh, port) 232 if err != nil { 233 err := fmt.Errorf("failed to collect traffic agent traces: %v", err) 234 span.RecordError(err) 235 dlog.Error(ctx, err) 236 } 237 }() 238 239 wg.Wait() 240 // End span so it gets reported via userdTraces 241 span.End() 242 // These go after the other traces so that we can capture traces from the gathering of traces itself 243 err = c.userdTraces(ctx, tCh) 244 if err != nil { 245 // Can't imagine this makes a difference, since we've failed to collect it, but we may as well record it 246 err = fmt.Errorf("failed to collect user daemon traces: %v\n", err) 247 span.RecordError(err) 248 dlog.Error(ctx, err) 249 } 250 251 close(tCh) 252 err = <-errCh 253 if err != nil { 254 return err 255 } 256 return nil 257 }