github.com/tilt-dev/tilt@v0.33.15-0.20240515162809-0a22ed45d8a0/internal/controllers/core/podlogstream/podlogstreamcontroller.go

package podlogstream

import (
	"context"
	"errors"
	"fmt"
	"io"
	"strings"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"

	"github.com/jonboulle/clockwork"

	"github.com/tilt-dev/tilt/internal/controllers/indexer"
	"github.com/tilt-dev/tilt/pkg/apis/core/v1alpha1"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/tilt-dev/tilt/internal/container"
	"github.com/tilt-dev/tilt/internal/controllers/apicmp"
	"github.com/tilt-dev/tilt/internal/engine/runtimelog"
	"github.com/tilt-dev/tilt/internal/k8s"
	"github.com/tilt-dev/tilt/internal/store"
	"github.com/tilt-dev/tilt/internal/store/k8sconv"
	"github.com/tilt-dev/tilt/pkg/logger"
)

var podLogHealthCheck = 15 * time.Second
var podLogReconnectGap = 2 * time.Second

const maxDebounceDuration = time.Minute

var clusterGVK = v1alpha1.SchemeGroupVersion.WithKind("Cluster")

// Reconciles the PodLogStream API object.
//
// Collects logs from deployed containers.
type Controller struct {
	ctx       context.Context
	client    ctrlclient.Client
	indexer   *indexer.Indexer
	st        store.RStore
	kClient   k8s.Client
	podSource *PodSource
	mu        sync.Mutex
	clock     clockwork.Clock

	watches         map[podLogKey]*podLogWatch
	hasClosedStream map[podLogKey]bool
	statuses        map[types.NamespacedName]*PodLogStreamStatus
	debounces       map[types.NamespacedName]time.Duration
}

var _ reconcile.Reconciler = &Controller{}
var _ store.TearDowner = &Controller{}

func NewController(ctx context.Context, client ctrlclient.Client, scheme *runtime.Scheme, st store.RStore, kClient k8s.Client, podSource *PodSource, clock clockwork.Clock) *Controller {
	return &Controller{
		ctx:             ctx,
		client:          client,
		indexer:         indexer.NewIndexer(scheme, indexPodLogStreamForTiltAPI),
		st:              st,
		kClient:         kClient,
		podSource:       podSource,
		watches:         make(map[podLogKey]*podLogWatch),
		hasClosedStream: make(map[podLogKey]bool),
		statuses:        make(map[types.NamespacedName]*PodLogStreamStatus),
		debounces:       make(map[types.NamespacedName]time.Duration),
		clock:           clock,
	}
}

// Filter containers based on the inclusions/exclusions in the PodLogStream spec.
func (c *Controller) filterContainers(stream *PodLogStream, containers []v1alpha1.Container) []v1alpha1.Container {
	if len(stream.Spec.OnlyContainers) > 0 {
		only := make(map[container.Name]bool, len(stream.Spec.OnlyContainers))
		for _, name := range stream.Spec.OnlyContainers {
			only[container.Name(name)] = true
		}

		result := []v1alpha1.Container{}
		for _, c := range containers {
			if only[container.Name(c.Name)] {
				result = append(result, c)
			}
		}
		return result
	}

	if len(stream.Spec.IgnoreContainers) > 0 {
		ignore := make(map[container.Name]bool, len(stream.Spec.IgnoreContainers))
		for _, name := range stream.Spec.IgnoreContainers {
			ignore[container.Name(name)] = true
		}

		result := []v1alpha1.Container{}
		for _, c := range containers {
			if !ignore[container.Name(c.Name)] {
				result = append(result, c)
			}
		}
		return result
	}
	return containers
}

func (c *Controller) TearDown(ctx context.Context) {
	c.podSource.TearDown()
}

func (c *Controller) shouldStreamContainerLogs(pod *v1.Pod, co v1alpha1.Container, key podLogKey) bool {
	if co.ID == "" {
		return false
	}

	isTerminating := co.State.Terminated != nil ||
		(pod.DeletionTimestamp != nil && !pod.DeletionTimestamp.IsZero())
	if isTerminating && c.hasClosedStream[key] {
		return false
	}

	if co.State.Running == nil && co.State.Terminated == nil {
		// Nothing to stream for containers in the waiting state.
		return false
	}

	return true
}

// Reconcile the given stream against what we're currently tracking.
func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	stream := &PodLogStream{}
	streamName := req.NamespacedName
	err := c.client.Get(ctx, req.NamespacedName, stream)
	c.indexer.OnReconcile(req.NamespacedName, stream)
	if apierrors.IsNotFound(err) {
		// handleReconcileRequest returns errors that should be published
		// to status.error. But the PodLogStream has been deleted, so we
		// can ignore them.
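		// We still forward the request so the PodSource can observe the
		// deletion and release any per-pod watches it holds for this
		// stream (that cleanup lives in PodSource, outside this file).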
		_ = c.podSource.handleReconcileRequest(ctx, req.NamespacedName, stream)
		c.deleteStreams(streamName)
		return reconcile.Result{}, nil
	} else if err != nil {
		return reconcile.Result{}, err
	}

	result := reconcile.Result{}
	ctx = store.MustObjectLogHandler(ctx, c.st, stream)
	err = c.podSource.handleReconcileRequest(ctx, streamName, stream)
	if err != nil {
		result = c.setErrorStatus(streamName, err)
	} else {
		podNN := types.NamespacedName{Name: stream.Spec.Pod, Namespace: stream.Spec.Namespace}
		pod, err := c.kClient.PodFromInformerCache(ctx, podNN)
		if err != nil && apierrors.IsNotFound(err) {
			c.deleteStreams(streamName)
			result = c.setErrorStatus(streamName, fmt.Errorf("pod not found: %s", podNN))
		} else if err != nil {
			result = c.setErrorStatus(streamName, fmt.Errorf("reading pod: %v", err))
		} else if pod != nil {
			result = c.addOrUpdateContainerWatches(ctx, streamName, stream, podNN, pod)
		}
	}

	err = c.maybeUpdateObjectStatus(ctx, streamName, stream)
	if err != nil {
		return reconcile.Result{}, err
	}

	return result, nil
}

func (c *Controller) addOrUpdateContainerWatches(ctx context.Context, streamName types.NamespacedName, stream *v1alpha1.PodLogStream, podNN types.NamespacedName, pod *v1.Pod) reconcile.Result {
	initContainers := c.filterContainers(stream, k8sconv.PodContainers(ctx, pod, pod.Status.InitContainerStatuses))
	runContainers := c.filterContainers(stream, k8sconv.PodContainers(ctx, pod, pod.Status.ContainerStatuses))
	containers := []v1alpha1.Container{}
	containers = append(containers, initContainers...)
	containers = append(containers, runContainers...)
	c.ensureStatusActive(streamName, containers)

	result := reconcile.Result{}
	containerWatches := make(map[podLogKey]bool)
	for i, co := range containers {
		// Key the log watcher by the container ID, so we auto-restart the
		// watching if the container crashes.
		key := podLogKey{
			streamName: streamName,
			podID:      k8s.PodID(podNN.Name),
			cID:        container.ID(co.ID),
		}
		if !c.shouldStreamContainerLogs(pod, co, key) {
			continue
		}

		isInitContainer := i < len(initContainers)

		// We don't want to clutter the logs with a container name
		// if it's unambiguous what container we're looking at.
		//
		// Long-term, we should make the container name a log field
		// and have better ways to display it visually.
		shouldPrefix := isInitContainer || len(runContainers) > 1

		containerWatches[key] = true

		existing, isActive := c.watches[key]

		// Only stream logs that have happened since Tilt started.
		//
		// TODO(nick): We should really record when we started the `kubectl apply`,
		// and only stream logs since that happened.
		startWatchTime := time.Time{}
		if stream.Spec.SinceTime != nil {
			startWatchTime = stream.Spec.SinceTime.Time
		}
		debounce := time.Second

		if isActive {
			if existing.ctx.Err() == nil {
				// The active pod watcher is still tailing the logs,
				// nothing to do.
				continue
			}

			// The active pod watcher got canceled somehow,
			// so we need to create a new one that picks up
			// where it left off.
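			// Block until the old consumeLogs goroutine finishes its
			// deferred cleanup, which records doneWatchTime and the next
			// debounce before closing doneCh.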
			<-existing.doneCh
			startWatchTime = existing.doneWatchTime
			debounce = existing.debounce
			c.hasClosedStream[key] = true
			if co.State.Terminated != nil {
				c.mutateContainerStatus(streamName, container.Name(co.Name), func(cs *ContainerLogStreamStatus) {
					cs.Terminated = true
					cs.Active = false
					cs.Error = ""
				})
				continue
			}

			if c.clock.Since(existing.doneWatchTime) < debounce {
				requeueAfter := debounce - c.clock.Since(existing.doneWatchTime)
				if requeueAfter > result.RequeueAfter {
					result.RequeueAfter = requeueAfter
				}
				continue
			}
		}

		ctx, cancel := context.WithCancel(ctx)
		w := &podLogWatch{
			streamName:     streamName,
			ctx:            ctx,
			cancel:         cancel,
			podID:          k8s.PodID(podNN.Name),
			cName:          container.Name(co.Name),
			namespace:      k8s.Namespace(podNN.Namespace),
			startWatchTime: startWatchTime,
			debounce:       debounce,
			doneCh:         make(chan struct{}),
			shouldPrefix:   shouldPrefix,
		}
		c.watches[key] = w

		go c.consumeLogs(w, c.st)
	}

	for key, watch := range c.watches {
		_, inState := containerWatches[key]
		if !inState && key.streamName == streamName {
			watch.cancel()
			delete(c.watches, key)
		}
	}
	return result
}

// Delete all the streams generated by the named API object.
func (c *Controller) deleteStreams(streamName types.NamespacedName) {
	for k, watch := range c.watches {
		if k.streamName != streamName {
			continue
		}
		watch.cancel()
		delete(c.watches, k)
	}

	delete(c.statuses, streamName)
}

// Consume logs in a goroutine.
//
// Note that this does NOT hold the lock, so anything in this function
// needs to be careful about accessing shared state.
func (c *Controller) consumeLogs(watch *podLogWatch, st store.RStore) {
	pID := watch.podID
	ctx := watch.ctx
	containerName := watch.cName
	var exitError error

	defer func() {
		// When the log streaming ends, log it and report the status change to the
		// apiserver.
		c.mu.Lock()
		c.mutateContainerStatus(watch.streamName, containerName, func(cs *ContainerLogStreamStatus) {
			cs.Active = false
			if exitError == nil {
				cs.Error = ""
			} else {
				cs.Error = exitError.Error()
			}
		})
		c.mu.Unlock()
		c.podSource.requeueStream(watch.streamName)

		if exitError != nil {
			// TODO(nick): Should this be Warnf/Errorf?
			logger.Get(ctx).Infof("Error streaming %s logs: %v", pID, exitError)
			// Back off the reconnect debounce exponentially, capped at
			// maxDebounceDuration, so a crash-looping container doesn't
			// trigger a reconnect storm.
			watch.debounce *= 2
			if watch.debounce > maxDebounceDuration {
				watch.debounce = maxDebounceDuration
			}
		} else {
			watch.debounce = time.Second
		}

		watch.doneWatchTime = c.clock.Now()
		close(watch.doneCh)
		watch.cancel()
	}()

	ns := watch.namespace
	startReadTime := watch.startWatchTime
	if watch.shouldPrefix {
		prefix := fmt.Sprintf("[%s] ", watch.cName)
		ctx = logger.WithLogger(ctx, logger.NewPrefixedLogger(prefix, logger.Get(ctx)))
	}

	retry := true
	for retry {
		retry = false
		ctx, cancel := context.WithCancel(ctx)
		readCloser, err := c.kClient.ContainerLogs(ctx, pID, containerName, ns, startReadTime)
		if err != nil {
			if ctx.Err() == nil {
				exitError = err
			}
			cancel()
			return
		}

		reader := runtimelog.NewHardCancelReader(ctx, readCloser)
		reader.Now = c.clock.Now

		// A hacky workaround for
		// https://github.com/tilt-dev/tilt/issues/3908
		// Every 15 seconds, check to see if the logs have stopped streaming.
		// If they have, reconnect to the log stream.
		done := make(chan bool)
		go func() {
			ticker := c.clock.NewTicker(podLogHealthCheck)
			for {
				select {
				case <-done:
					return

				case <-ticker.Chan():
					lastRead := reader.LastReadTime()
					if lastRead.IsZero() || c.clock.Since(lastRead) < podLogHealthCheck {
						continue
					}

					retry = true

					// Start reading 2 seconds after the last read.
					//
					// In the common case (where we just haven't gotten any logs in the
					// last 15 seconds), this will ensure we don't duplicate logs.
					//
					// In the uncommon case (where the Kubernetes log buffer exceeded 10MB
					// and got rotated), this will create a 2-second gap in the log, but
					// we think this is acceptable to avoid the duplicate case.
					startReadTime = lastRead.Add(podLogReconnectGap)
					cancel()
					return
				}
			}
		}()

		c.mu.Lock()
		c.mutateContainerStatus(watch.streamName, containerName, func(cs *ContainerLogStreamStatus) {
			cs.Active = true
			cs.Error = ""
		})
		c.mu.Unlock()
		c.podSource.requeueStream(watch.streamName)

		writer := &errorCapturingWriter{underlying: logger.Get(ctx).Writer(logger.InfoLvl)}
		_, err = io.Copy(writer, reader)
		_ = readCloser.Close()
		close(done)

		wasCanceledUpstream := ctx.Err() != nil
		cancel()

		newlineTerminated := writer.newlineTerminated
		errorTerminated := writer.errorTerminated
		if !newlineTerminated {
			_, _ = writer.Write(newline)
		}

		if err == nil && errorTerminated != "" {
			err = errors.New(errorTerminated)
		}

		if !retry && err != nil && !wasCanceledUpstream {
			exitError = err
			return
		}
	}
}

// Set up the status object for a particular stream, tracking each container individually.
func (c *Controller) ensureStatusActive(streamName types.NamespacedName, containers []v1alpha1.Container) {
	status, ok := c.statuses[streamName]
	if !ok {
		status = &PodLogStreamStatus{}
		c.statuses[streamName] = status
	}

	// Make sure the container names are right. If they're not, delete everything and recreate.
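	// Names are compared positionally, so a reordering of containers also
	// counts as a mismatch and resets every container status.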
	isMatching := len(containers) == len(status.ContainerStatuses)
	if isMatching {
		for i, cs := range status.ContainerStatuses {
			if containers[i].Name != cs.Name {
				isMatching = false
				break
			}
		}
	}

	if isMatching {
		return
	}

	statuses := make([]ContainerLogStreamStatus, 0, len(containers))
	for _, c := range containers {
		statuses = append(statuses, ContainerLogStreamStatus{
			Name: c.Name,
		})
	}
	status.ContainerStatuses = statuses
	status.Error = ""
	delete(c.debounces, streamName)
}

// Modify the status of a container log stream.
func (c *Controller) mutateContainerStatus(streamName types.NamespacedName, containerName container.Name, mutate func(*ContainerLogStreamStatus)) {
	status, ok := c.statuses[streamName]
	if !ok {
		return
	}

	for i, cs := range status.ContainerStatuses {
		if cs.Name != string(containerName) {
			continue
		}

		mutate(&cs)
		status.ContainerStatuses[i] = cs
	}
}

// Set the pod log stream to an error status.
func (c *Controller) setErrorStatus(streamName types.NamespacedName, err error) reconcile.Result {
	status, ok := c.statuses[streamName]
	if !ok {
		status = &PodLogStreamStatus{}
		c.statuses[streamName] = status
	}
	if err == nil {
		status.Error = ""
		delete(c.debounces, streamName)
	} else {
		status.Error = err.Error()

		// Double the requeue debounce on each consecutive error, starting
		// at 1s and capped at maxDebounceDuration.
		prevDebounce := c.debounces[streamName]
		newDebounce := prevDebounce * 2
		if newDebounce == 0 {
			newDebounce = time.Second
		} else if newDebounce > maxDebounceDuration {
			newDebounce = maxDebounceDuration
		}
		c.debounces[streamName] = newDebounce
	}
	return reconcile.Result{RequeueAfter: c.debounces[streamName]}
}

// Update the server with the current container status.
func (c *Controller) maybeUpdateObjectStatus(ctx context.Context, nn types.NamespacedName, obj *v1alpha1.PodLogStream) error {
	status, ok := c.statuses[nn]
	if !ok || apicmp.DeepEqual(*status, obj.Status) {
		return nil
	}

	oldError := obj.Status.Error
	newError := status.Error
	update := obj.DeepCopy()
	status.DeepCopyInto(&update.Status)

	err := c.client.Status().Update(c.ctx, update)
	if err != nil {
		return err
	}

	// Only show new errors.
	//
	// Don't show errors about the pod not being found. Those are pretty common
	// when the pod hasn't shown up in the informer cache yet.
	if newError != "" && !strings.HasPrefix(newError, "pod not found:") && newError != oldError {
		logger.Get(ctx).Errorf("podlogstream %s: %s", obj.Name, newError)
	}
	return nil
}

func (c *Controller) CreateBuilder(mgr ctrl.Manager) (*builder.Builder, error) {
	b := ctrl.NewControllerManagedBy(mgr).
		For(&PodLogStream{}).
		WatchesRawSource(c.podSource)

	return b, nil
}

type podLogWatch struct {
	ctx    context.Context
	cancel func()

	streamName     types.NamespacedName
	podID          k8s.PodID
	namespace      k8s.Namespace
	cName          container.Name
	startWatchTime time.Time
	doneWatchTime  time.Time
	debounce       time.Duration
	doneCh         chan struct{}

	shouldPrefix bool // if true, we'll prefix logs with the container name
}

type podLogKey struct {
	streamName types.NamespacedName
	podID      k8s.PodID
	cID        container.ID
}

// indexPodLogStreamForTiltAPI indexes a PodLogStream object and returns keys
// for objects from the Tilt apiserver that it watches.
//
// See also: indexPodLogStreamForKubernetes, which indexes a PodLogStream object
// and returns keys for objects from the Kubernetes cluster that it watches via
// PodSource.
func indexPodLogStreamForTiltAPI(obj ctrlclient.Object) []indexer.Key {
	var results []indexer.Key
	pls := obj.(*v1alpha1.PodLogStream)
	if pls != nil && pls.Spec.Cluster != "" {
		results = append(results, indexer.Key{
			Name: types.NamespacedName{Namespace: pls.Namespace, Name: pls.Spec.Cluster},
			GVK:  clusterGVK,
		})
	}
	return results
}