github.com/instana/go-sensor@v1.62.2-0.20240520081010-4919868049e1/fargate_agent.go (about) 1 // (c) Copyright IBM Corp. 2021 2 // (c) Copyright Instana Inc. 2020 3 4 package instana 5 6 import ( 7 "bytes" 8 "context" 9 "encoding/json" 10 "fmt" 11 "io" 12 "io/ioutil" 13 "net/http" 14 "os" 15 "strconv" 16 "sync" 17 "time" 18 19 "github.com/instana/go-sensor/acceptor" 20 "github.com/instana/go-sensor/autoprofile" 21 "github.com/instana/go-sensor/aws" 22 "github.com/instana/go-sensor/docker" 23 ) 24 25 type fargateSnapshot struct { 26 Service serverlessSnapshot 27 Task aws.ECSTaskMetadata 28 Container aws.ECSContainerMetadata 29 } 30 31 func newFargateSnapshot(pid int, taskMD aws.ECSTaskMetadata, containerMD aws.ECSContainerMetadata) fargateSnapshot { 32 return fargateSnapshot{ 33 Service: serverlessSnapshot{ 34 EntityID: ecsEntityID(containerMD), 35 Host: containerMD.TaskARN, 36 PID: pid, 37 StartedAt: processStartedAt, 38 Container: containerSnapshot{ 39 ID: containerMD.DockerID, 40 Type: "docker", 41 Image: containerMD.Image, 42 }, 43 }, 44 Task: taskMD, 45 Container: containerMD, 46 } 47 } 48 49 func newECSTaskPluginPayload(snapshot fargateSnapshot) acceptor.PluginPayload { 50 return acceptor.NewECSTaskPluginPayload(snapshot.Task.TaskARN, acceptor.ECSTaskData{ 51 TaskARN: snapshot.Task.TaskARN, 52 ClusterARN: snapshot.Container.Cluster, 53 AvailabilityZone: snapshot.Task.AvailabilityZone, 54 InstanaZone: snapshot.Service.Zone, 55 TaskDefinition: snapshot.Task.Family, 56 TaskDefinitionVersion: snapshot.Task.Revision, 57 DesiredStatus: snapshot.Task.DesiredStatus, 58 KnownStatus: snapshot.Task.KnownStatus, 59 Limits: acceptor.AWSContainerLimits{ 60 CPU: snapshot.Container.Limits.CPU, 61 Memory: snapshot.Container.Limits.Memory, 62 }, 63 PullStartedAt: snapshot.Task.PullStartedAt, 64 PullStoppedAt: snapshot.Task.PullStoppedAt, 65 Tags: snapshot.Service.Tags, 66 }) 67 } 68 69 func newECSContainerPluginPayload(container aws.ECSContainerMetadata, instrumented bool) acceptor.PluginPayload { 70 data := acceptor.ECSContainerData{ 71 Instrumented: instrumented, 72 DockerID: container.DockerID, 73 DockerName: container.DockerName, 74 ContainerName: container.Name, 75 Image: container.Image, 76 ImageID: container.ImageID, 77 TaskARN: container.TaskARN, 78 TaskDefinition: container.TaskDefinition, 79 TaskDefinitionVersion: container.TaskDefinitionVersion, 80 ClusterARN: container.Cluster, 81 DesiredStatus: container.DesiredStatus, 82 KnownStatus: container.KnownStatus, 83 Limits: acceptor.AWSContainerLimits{ 84 CPU: container.Limits.CPU, 85 Memory: container.Limits.Memory, 86 }, 87 CreatedAt: container.CreatedAt, 88 StartedAt: container.StartedAt, 89 Type: container.Type, 90 } 91 92 // we only know the runtime for sure for the instrumented container 93 if instrumented { 94 data.Runtime = "go" 95 } 96 97 return acceptor.NewECSContainerPluginPayload(ecsEntityID(container), data) 98 } 99 100 func newDockerContainerPluginPayload( 101 container aws.ECSContainerMetadata, 102 prevStats, currentStats docker.ContainerStats, 103 instrumented bool, 104 ) acceptor.PluginPayload { 105 106 var networkMode string 107 if len(container.Networks) > 0 { 108 networkMode = container.Networks[0].Mode 109 } 110 111 data := acceptor.DockerData{ 112 ID: container.DockerID, 113 CreatedAt: container.CreatedAt, 114 StartedAt: container.StartedAt, 115 Image: container.Image, 116 Labels: container.ContainerLabels, 117 Names: []string{container.DockerName}, 118 NetworkMode: networkMode, 119 Memory: acceptor.NewDockerMemoryStatsUpdate(prevStats.Memory, currentStats.Memory), 120 CPU: acceptor.NewDockerCPUStatsDelta(prevStats.CPU, currentStats.CPU), 121 Network: acceptor.NewDockerNetworkAggregatedStatsDelta(prevStats.Networks, currentStats.Networks), 122 BlockIO: acceptor.NewDockerBlockIOStatsDelta(prevStats.BlockIO, currentStats.BlockIO), 123 } 124 125 // we only know the command for the instrumented container 126 if instrumented { 127 data.Command = os.Args[0] 128 } 129 130 return acceptor.NewDockerPluginPayload(ecsEntityID(container), data) 131 } 132 133 type metricsPayload struct { 134 Plugins []acceptor.PluginPayload `json:"plugins"` 135 } 136 137 type fargateAgent struct { 138 Endpoint string 139 Key string 140 PID int 141 Zone string 142 Tags map[string]interface{} 143 144 snapshot fargateSnapshot 145 lastDockerStats map[string]docker.ContainerStats 146 lastProcessStats processStats 147 148 mu sync.Mutex 149 spanQueue []Span 150 151 runtimeSnapshot *SnapshotCollector 152 dockerStats *ecsDockerStatsCollector 153 processStats *processStatsCollector 154 client *http.Client 155 ecs *aws.ECSMetadataProvider 156 logger LeveledLogger 157 } 158 159 func newFargateAgent( 160 serviceName, acceptorEndpoint, agentKey string, 161 client *http.Client, 162 mdProvider *aws.ECSMetadataProvider, 163 logger LeveledLogger, 164 ) *fargateAgent { 165 166 if logger == nil { 167 logger = defaultLogger 168 } 169 170 if client == nil { 171 client = http.DefaultClient 172 } 173 174 logger.Debug("initializing aws fargate agent") 175 176 agent := &fargateAgent{ 177 Endpoint: acceptorEndpoint, 178 Key: agentKey, 179 PID: os.Getpid(), 180 Zone: os.Getenv("INSTANA_ZONE"), 181 Tags: parseInstanaTags(os.Getenv("INSTANA_TAGS")), 182 runtimeSnapshot: &SnapshotCollector{ 183 CollectionInterval: snapshotCollectionInterval, 184 ServiceName: serviceName, 185 }, 186 dockerStats: &ecsDockerStatsCollector{ 187 ecs: mdProvider, 188 logger: logger, 189 }, 190 processStats: &processStatsCollector{ 191 logger: logger, 192 }, 193 client: client, 194 ecs: mdProvider, 195 logger: logger, 196 } 197 198 go func() { 199 for { 200 // ECS task metadata publishes the full data (e.g. container.StartedAt) 201 // only after a while, so we need to keep trying to gather the full data 202 for i := 0; i < maximumRetries; i++ { 203 snapshot, ok := agent.collectSnapshot(context.Background()) 204 if ok { 205 agent.snapshot = snapshot 206 break 207 } 208 209 time.Sleep(expDelay(i + 1)) 210 } 211 time.Sleep(snapshotCollectionInterval) 212 } 213 }() 214 go agent.dockerStats.Run(context.Background(), time.Second) 215 go agent.processStats.Run(context.Background(), time.Second) 216 217 return agent 218 } 219 220 func (a *fargateAgent) Ready() bool { return a.snapshot.Service.EntityID != "" } 221 222 func (a *fargateAgent) SendMetrics(data acceptor.Metrics) (err error) { 223 dockerStats := a.dockerStats.Collect() 224 processStats := a.processStats.Collect() 225 defer func() { 226 if err == nil { 227 // only update the last sent stats if they were transmitted successfully 228 // since they are updated on the backend incrementally using received 229 // deltas 230 a.lastDockerStats = dockerStats 231 a.lastProcessStats = processStats 232 } 233 }() 234 235 payload := struct { 236 Metrics metricsPayload `json:"metrics,omitempty"` 237 Spans []Span `json:"spans,omitempty"` 238 }{ 239 Metrics: metricsPayload{ 240 Plugins: []acceptor.PluginPayload{ 241 newECSTaskPluginPayload(a.snapshot), 242 newProcessPluginPayload(a.snapshot.Service, a.lastProcessStats, processStats), 243 acceptor.NewGoProcessPluginPayload(acceptor.GoProcessData{ 244 PID: a.PID, 245 Snapshot: a.runtimeSnapshot.Collect(), 246 Metrics: data, 247 }), 248 }, 249 }, 250 } 251 252 for _, container := range a.snapshot.Task.Containers { 253 instrumented := ecsEntityID(container) == a.snapshot.Service.EntityID 254 payload.Metrics.Plugins = append( 255 payload.Metrics.Plugins, 256 newECSContainerPluginPayload(container, instrumented), 257 newDockerContainerPluginPayload( 258 container, 259 a.lastDockerStats[container.DockerID], 260 dockerStats[container.DockerID], 261 instrumented, 262 ), 263 ) 264 } 265 266 a.mu.Lock() 267 if len(a.spanQueue) > 0 { 268 payload.Spans = make([]Span, len(a.spanQueue)) 269 copy(payload.Spans, a.spanQueue) 270 a.spanQueue = a.spanQueue[:0] 271 } 272 a.mu.Unlock() 273 274 buf := bytes.NewBuffer(nil) 275 if err := json.NewEncoder(buf).Encode(payload); err != nil { 276 return fmt.Errorf("failed to marshal metrics payload: %s", err) 277 } 278 279 req, err := http.NewRequest(http.MethodPost, a.Endpoint+"/bundle", buf) 280 if err != nil { 281 return fmt.Errorf("failed to prepare send metrics request: %s", err) 282 } 283 284 req.Header.Set("Content-Type", "application/json") 285 286 return a.sendRequest(req) 287 } 288 289 func (a *fargateAgent) SendEvent(event *EventData) error { return nil } 290 291 func (a *fargateAgent) SendSpans(spans []Span) error { 292 from := newServerlessAgentFromS(a.snapshot.Service.EntityID, "aws") 293 for i := range spans { 294 spans[i].From = from 295 } 296 297 // enqueue the spans to send them in a bundle with metrics instead of sending immediately 298 a.mu.Lock() 299 a.spanQueue = append(a.spanQueue, spans...) 300 a.mu.Unlock() 301 302 return nil 303 } 304 305 func (a *fargateAgent) SendProfiles(profiles []autoprofile.Profile) error { return nil } 306 307 func (a *fargateAgent) Flush(ctx context.Context) error { 308 if len(a.spanQueue) == 0 { 309 return nil 310 } 311 312 if !a.Ready() { 313 return ErrAgentNotReady 314 } 315 316 a.mu.Lock() 317 defer a.mu.Unlock() 318 319 buf := bytes.NewBuffer(nil) 320 if err := json.NewEncoder(buf).Encode(a.spanQueue); err != nil { 321 return fmt.Errorf("failed to marshal traces payload: %s", err) 322 } 323 a.spanQueue = a.spanQueue[:0] 324 325 req, err := http.NewRequest(http.MethodPost, a.Endpoint+"/traces", buf) 326 if err != nil { 327 return fmt.Errorf("failed to prepare send traces request: %s", err) 328 } 329 330 req.Header.Set("Content-Type", "application/json") 331 332 return a.sendRequest(req.WithContext(ctx)) 333 } 334 335 func (a *fargateAgent) sendRequest(req *http.Request) error { 336 req.Header.Set("X-Instana-Host", a.snapshot.Service.EntityID) 337 req.Header.Set("X-Instana-Key", a.Key) 338 req.Header.Set("X-Instana-Time", strconv.FormatInt(time.Now().UnixNano()/int64(time.Millisecond), 10)) 339 340 resp, err := a.client.Do(req) 341 if err != nil { 342 return fmt.Errorf("failed to send request to the serverless agent: %s", err) 343 } 344 345 defer resp.Body.Close() 346 347 if resp.StatusCode >= http.StatusBadRequest { 348 respBody, err := ioutil.ReadAll(resp.Body) 349 if err != nil { 350 a.logger.Debug("failed to read serverless agent response: ", err) 351 return nil 352 } 353 354 a.logger.Info("serverless agent has responded with ", resp.Status, ": ", string(respBody)) 355 return nil 356 } 357 358 io.CopyN(ioutil.Discard, resp.Body, 1<<20) 359 360 return nil 361 } 362 363 func (a *fargateAgent) collectSnapshot(ctx context.Context) (fargateSnapshot, bool) { 364 var wg sync.WaitGroup 365 366 // fetch task metadata 367 wg.Add(1) 368 var taskMD aws.ECSTaskMetadata 369 go func() { 370 defer wg.Done() 371 372 var err error 373 taskMD, err = a.ecs.TaskMetadata(ctx) 374 if err != nil { 375 a.logger.Warn("failed to get task metadata: ", err) 376 } 377 }() 378 379 // fetch container metadata 380 wg.Add(1) 381 var containerMD aws.ECSContainerMetadata 382 go func() { 383 defer wg.Done() 384 385 var err error 386 containerMD, err = a.ecs.ContainerMetadata(ctx) 387 if err != nil { 388 a.logger.Warn("failed to get container metadata: ", err) 389 } 390 }() 391 392 wg.Wait() 393 394 // ensure that all metadata has been gathered 395 if taskMD.TaskARN == "" || containerMD.StartedAt.IsZero() { 396 a.logger.Error("snapshot collection failed (the metadata might not be ready yet)") 397 return fargateSnapshot{}, false 398 } 399 400 snapshot := newFargateSnapshot(a.PID, taskMD, containerMD) 401 snapshot.Service.Zone = a.Zone 402 snapshot.Service.Tags = a.Tags 403 404 a.logger.Debug("collected snapshot") 405 406 return snapshot, true 407 } 408 409 type ecsDockerStatsCollector struct { 410 ecs interface { 411 TaskStats(context.Context) (map[string]docker.ContainerStats, error) 412 } 413 logger LeveledLogger 414 415 mu sync.RWMutex 416 stats map[string]docker.ContainerStats 417 } 418 419 func (c *ecsDockerStatsCollector) Run(ctx context.Context, collectionInterval time.Duration) { 420 timer := time.NewTicker(collectionInterval) 421 defer timer.Stop() 422 423 for { 424 select { 425 case <-timer.C: 426 fetchCtx, cancel := context.WithTimeout(ctx, collectionInterval) 427 c.fetchStats(fetchCtx) 428 cancel() 429 case <-ctx.Done(): 430 return 431 } 432 } 433 } 434 435 func (c *ecsDockerStatsCollector) Collect() map[string]docker.ContainerStats { 436 c.mu.RLock() 437 defer c.mu.RUnlock() 438 439 return c.stats 440 } 441 442 func (c *ecsDockerStatsCollector) fetchStats(ctx context.Context) { 443 stats, err := c.ecs.TaskStats(ctx) 444 if err != nil { 445 if ctx.Err() != nil { 446 // request either timed out or had been cancelled, keep the old value 447 c.logger.Debug("failed to retrieve Docker container stats (timed out), skipping") 448 return 449 } 450 451 // request failed, reset recorded stats 452 c.logger.Warn("failed to retrieve Docker container stats: ", err) 453 stats = nil 454 } 455 456 c.mu.Lock() 457 c.stats = stats 458 defer c.mu.Unlock() 459 } 460 461 func ecsEntityID(md aws.ECSContainerMetadata) string { 462 return md.TaskARN + "::" + md.Name 463 }