github.com/tilt-dev/tilt@v0.33.15-0.20240515162809-0a22ed45d8a0/internal/controllers/core/podlogstream/podlogstreamcontroller.go (about)

     1  package podlogstream
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"k8s.io/apimachinery/pkg/runtime"
    13  	"sigs.k8s.io/controller-runtime/pkg/builder"
    14  
    15  	"github.com/jonboulle/clockwork"
    16  
    17  	"github.com/tilt-dev/tilt/internal/controllers/indexer"
    18  	"github.com/tilt-dev/tilt/pkg/apis/core/v1alpha1"
    19  
    20  	v1 "k8s.io/api/core/v1"
    21  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    22  	"k8s.io/apimachinery/pkg/types"
    23  	ctrl "sigs.k8s.io/controller-runtime"
    24  	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
    25  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    26  
    27  	"github.com/tilt-dev/tilt/internal/container"
    28  	"github.com/tilt-dev/tilt/internal/controllers/apicmp"
    29  	"github.com/tilt-dev/tilt/internal/engine/runtimelog"
    30  	"github.com/tilt-dev/tilt/internal/k8s"
    31  	"github.com/tilt-dev/tilt/internal/store"
    32  	"github.com/tilt-dev/tilt/internal/store/k8sconv"
    33  	"github.com/tilt-dev/tilt/pkg/logger"
    34  )
    35  
    36  var podLogHealthCheck = 15 * time.Second
    37  var podLogReconnectGap = 2 * time.Second
    38  
    39  const maxDebounceDuration = time.Minute
    40  
    41  var clusterGVK = v1alpha1.SchemeGroupVersion.WithKind("Cluster")
    42  
    43  // Reconciles the PodLogStream API object.
    44  //
    45  // Collects logs from deployed containers.
    46  type Controller struct {
    47  	ctx       context.Context
    48  	client    ctrlclient.Client
    49  	indexer   *indexer.Indexer
    50  	st        store.RStore
    51  	kClient   k8s.Client
    52  	podSource *PodSource
    53  	mu        sync.Mutex
    54  	clock     clockwork.Clock
    55  
    56  	watches         map[podLogKey]*podLogWatch
    57  	hasClosedStream map[podLogKey]bool
    58  	statuses        map[types.NamespacedName]*PodLogStreamStatus
    59  	debounces       map[types.NamespacedName]time.Duration
    60  }
    61  
    62  var _ reconcile.Reconciler = &Controller{}
    63  var _ store.TearDowner = &Controller{}
    64  
    65  func NewController(ctx context.Context, client ctrlclient.Client, scheme *runtime.Scheme, st store.RStore, kClient k8s.Client, podSource *PodSource, clock clockwork.Clock) *Controller {
    66  	return &Controller{
    67  		ctx:             ctx,
    68  		client:          client,
    69  		indexer:         indexer.NewIndexer(scheme, indexPodLogStreamForTiltAPI),
    70  		st:              st,
    71  		kClient:         kClient,
    72  		podSource:       podSource,
    73  		watches:         make(map[podLogKey]*podLogWatch),
    74  		hasClosedStream: make(map[podLogKey]bool),
    75  		statuses:        make(map[types.NamespacedName]*PodLogStreamStatus),
    76  		debounces:       make(map[types.NamespacedName]time.Duration),
    77  		clock:           clock,
    78  	}
    79  }
    80  
    81  // Filter containers based on the inclusions/exclusions in the PodLogStream spec.
    82  func (c *Controller) filterContainers(stream *PodLogStream, containers []v1alpha1.Container) []v1alpha1.Container {
    83  	if len(stream.Spec.OnlyContainers) > 0 {
    84  		only := make(map[container.Name]bool, len(stream.Spec.OnlyContainers))
    85  		for _, name := range stream.Spec.OnlyContainers {
    86  			only[container.Name(name)] = true
    87  		}
    88  
    89  		result := []v1alpha1.Container{}
    90  		for _, c := range containers {
    91  			if only[container.Name(c.Name)] {
    92  				result = append(result, c)
    93  			}
    94  		}
    95  		return result
    96  	}
    97  
    98  	if len(stream.Spec.IgnoreContainers) > 0 {
    99  		ignore := make(map[container.Name]bool, len(stream.Spec.IgnoreContainers))
   100  		for _, name := range stream.Spec.IgnoreContainers {
   101  			ignore[container.Name(name)] = true
   102  		}
   103  
   104  		result := []v1alpha1.Container{}
   105  		for _, c := range containers {
   106  			if !ignore[container.Name(c.Name)] {
   107  				result = append(result, c)
   108  			}
   109  		}
   110  		return result
   111  	}
   112  	return containers
   113  }
   114  
   115  func (c *Controller) TearDown(ctx context.Context) {
   116  	c.podSource.TearDown()
   117  }
   118  
   119  func (c *Controller) shouldStreamContainerLogs(pod *v1.Pod, co v1alpha1.Container, key podLogKey) bool {
   120  	if co.ID == "" {
   121  		return false
   122  	}
   123  
   124  	isTerminating := co.State.Terminated != nil ||
   125  		(pod.DeletionTimestamp != nil && !pod.DeletionTimestamp.IsZero())
   126  	if isTerminating && c.hasClosedStream[key] {
   127  		return false
   128  	}
   129  
   130  	if co.State.Running == nil && co.State.Terminated == nil {
   131  		// nothing to stream for containers in waiting state
   132  		return false
   133  	}
   134  
   135  	return true
   136  
   137  }
   138  
   139  // Reconcile the given stream against what we're currently tracking.
   140  func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
   141  	c.mu.Lock()
   142  	defer c.mu.Unlock()
   143  
   144  	stream := &PodLogStream{}
   145  	streamName := req.NamespacedName
   146  	err := c.client.Get(ctx, req.NamespacedName, stream)
   147  	c.indexer.OnReconcile(req.NamespacedName, stream)
   148  	if apierrors.IsNotFound(err) {
   149  		// handleReconcileRequest returns errors that should be published
   150  		// to status.error. But the pod log stream is deleted! so can ignore.
   151  		_ = c.podSource.handleReconcileRequest(ctx, req.NamespacedName, stream)
   152  		c.deleteStreams(streamName)
   153  		return reconcile.Result{}, nil
   154  	} else if err != nil {
   155  		return reconcile.Result{}, err
   156  	}
   157  
   158  	result := reconcile.Result{}
   159  	ctx = store.MustObjectLogHandler(ctx, c.st, stream)
   160  	err = c.podSource.handleReconcileRequest(ctx, streamName, stream)
   161  	if err != nil {
   162  		result = c.setErrorStatus(streamName, err)
   163  	} else {
   164  		podNN := types.NamespacedName{Name: stream.Spec.Pod, Namespace: stream.Spec.Namespace}
   165  		pod, err := c.kClient.PodFromInformerCache(ctx, podNN)
   166  		if err != nil && apierrors.IsNotFound(err) {
   167  			c.deleteStreams(streamName)
   168  			result = c.setErrorStatus(streamName, fmt.Errorf("pod not found: %s", podNN))
   169  		} else if err != nil {
   170  			result = c.setErrorStatus(streamName, fmt.Errorf("reading pod: %v", err))
   171  		} else if pod != nil {
   172  			result = c.addOrUpdateContainerWatches(ctx, streamName, stream, podNN, pod)
   173  		}
   174  	}
   175  
   176  	err = c.maybeUpdateObjectStatus(ctx, streamName, stream)
   177  	if err != nil {
   178  		return reconcile.Result{}, err
   179  	}
   180  
   181  	return result, nil
   182  }
   183  
   184  func (c *Controller) addOrUpdateContainerWatches(ctx context.Context, streamName types.NamespacedName, stream *v1alpha1.PodLogStream, podNN types.NamespacedName, pod *v1.Pod) reconcile.Result {
   185  	initContainers := c.filterContainers(stream, k8sconv.PodContainers(ctx, pod, pod.Status.InitContainerStatuses))
   186  	runContainers := c.filterContainers(stream, k8sconv.PodContainers(ctx, pod, pod.Status.ContainerStatuses))
   187  	containers := []v1alpha1.Container{}
   188  	containers = append(containers, initContainers...)
   189  	containers = append(containers, runContainers...)
   190  	c.ensureStatusActive(streamName, containers)
   191  
   192  	result := reconcile.Result{}
   193  	containerWatches := make(map[podLogKey]bool)
   194  	for i, co := range containers {
   195  		// Key the log watcher by the container id, so we auto-restart the
   196  		// watching if the container crashes.
   197  		key := podLogKey{
   198  			streamName: streamName,
   199  			podID:      k8s.PodID(podNN.Name),
   200  			cID:        container.ID(co.ID),
   201  		}
   202  		if !c.shouldStreamContainerLogs(pod, co, key) {
   203  			continue
   204  		}
   205  
   206  		isInitContainer := i < len(initContainers)
   207  
   208  		// We don't want to clutter the logs with a container name
   209  		// if it's unambiguous what container we're looking at.
   210  		//
   211  		// Long-term, we should make the container name a log field
   212  		// and have better ways to display it visually.
   213  		shouldPrefix := isInitContainer || len(runContainers) > 1
   214  
   215  		containerWatches[key] = true
   216  
   217  		existing, isActive := c.watches[key]
   218  
   219  		// Only stream logs that have happened since Tilt started.
   220  		//
   221  		// TODO(nick): We should really record when we started the `kubectl apply`,
   222  		// and only stream logs since that happened.
   223  		startWatchTime := time.Time{}
   224  		if stream.Spec.SinceTime != nil {
   225  			startWatchTime = stream.Spec.SinceTime.Time
   226  		}
   227  		debounce := time.Second
   228  
   229  		if isActive {
   230  			if existing.ctx.Err() == nil {
   231  				// The active pod watcher is still tailing the logs,
   232  				// nothing to do.
   233  				continue
   234  			}
   235  
   236  			// The active pod watcher got canceled somehow,
   237  			// so we need to create a new one that picks up
   238  			// where it left off.
   239  			<-existing.doneCh
   240  			startWatchTime = existing.doneWatchTime
   241  			debounce = existing.debounce
   242  			c.hasClosedStream[key] = true
   243  			if co.State.Terminated != nil {
   244  				c.mutateContainerStatus(streamName, container.Name(co.Name), func(cs *ContainerLogStreamStatus) {
   245  					cs.Terminated = true
   246  					cs.Active = false
   247  					cs.Error = ""
   248  				})
   249  				continue
   250  			}
   251  
   252  			if c.clock.Since(existing.doneWatchTime) < debounce {
   253  				requeueAfter := debounce - c.clock.Since(existing.doneWatchTime)
   254  				if requeueAfter > result.RequeueAfter {
   255  					result.RequeueAfter = requeueAfter
   256  				}
   257  				continue
   258  			}
   259  		}
   260  
   261  		ctx, cancel := context.WithCancel(ctx)
   262  		w := &podLogWatch{
   263  			streamName:     streamName,
   264  			ctx:            ctx,
   265  			cancel:         cancel,
   266  			podID:          k8s.PodID(podNN.Name),
   267  			cName:          container.Name(co.Name),
   268  			namespace:      k8s.Namespace(podNN.Namespace),
   269  			startWatchTime: startWatchTime,
   270  			debounce:       debounce,
   271  			doneCh:         make(chan struct{}),
   272  			shouldPrefix:   shouldPrefix,
   273  		}
   274  		c.watches[key] = w
   275  
   276  		go c.consumeLogs(w, c.st)
   277  	}
   278  
   279  	for key, watch := range c.watches {
   280  		_, inState := containerWatches[key]
   281  		if !inState && key.streamName == streamName {
   282  			watch.cancel()
   283  			delete(c.watches, key)
   284  		}
   285  	}
   286  	return result
   287  }
   288  
   289  // Delete all the streams generated by the named API object
   290  func (c *Controller) deleteStreams(streamName types.NamespacedName) {
   291  	for k, watch := range c.watches {
   292  		if k.streamName != streamName {
   293  			continue
   294  		}
   295  		watch.cancel()
   296  		delete(c.watches, k)
   297  	}
   298  
   299  	delete(c.statuses, streamName)
   300  }
   301  
   302  // Consume logs in a goroutine.
   303  //
   304  // Note that this does NOT have the lock, so anything in this function
   305  // needs to be careful about accessing shared state.
   306  func (c *Controller) consumeLogs(watch *podLogWatch, st store.RStore) {
   307  	pID := watch.podID
   308  	ctx := watch.ctx
   309  	containerName := watch.cName
   310  	var exitError error
   311  
   312  	defer func() {
   313  		// When the log streaming ends, log it and report the status change to the
   314  		// apiserver.
   315  		c.mu.Lock()
   316  		c.mutateContainerStatus(watch.streamName, containerName, func(cs *ContainerLogStreamStatus) {
   317  			cs.Active = false
   318  			if exitError == nil {
   319  				cs.Error = ""
   320  			} else {
   321  				cs.Error = exitError.Error()
   322  			}
   323  		})
   324  		c.mu.Unlock()
   325  		c.podSource.requeueStream(watch.streamName)
   326  
   327  		if exitError != nil {
   328  			// TODO(nick): Should this be Warnf/Errorf?
   329  			logger.Get(ctx).Infof("Error streaming %s logs: %v", pID, exitError)
   330  			watch.debounce *= 2
   331  			if watch.debounce > maxDebounceDuration {
   332  				watch.debounce = maxDebounceDuration
   333  			}
   334  		} else {
   335  			watch.debounce = time.Second
   336  		}
   337  
   338  		watch.doneWatchTime = c.clock.Now()
   339  		close(watch.doneCh)
   340  		watch.cancel()
   341  	}()
   342  
   343  	ns := watch.namespace
   344  	startReadTime := watch.startWatchTime
   345  	if watch.shouldPrefix {
   346  		prefix := fmt.Sprintf("[%s] ", watch.cName)
   347  		ctx = logger.WithLogger(ctx, logger.NewPrefixedLogger(prefix, logger.Get(ctx)))
   348  	}
   349  
   350  	retry := true
   351  	for retry {
   352  		retry = false
   353  		ctx, cancel := context.WithCancel(ctx)
   354  		readCloser, err := c.kClient.ContainerLogs(ctx, pID, containerName, ns, startReadTime)
   355  		if err != nil {
   356  			if ctx.Err() == nil {
   357  				exitError = err
   358  			}
   359  			cancel()
   360  			return
   361  		}
   362  
   363  		reader := runtimelog.NewHardCancelReader(ctx, readCloser)
   364  		reader.Now = c.clock.Now
   365  
   366  		// A hacky workaround for
   367  		// https://github.com/tilt-dev/tilt/issues/3908
   368  		// Every 15 seconds, check to see if the logs have stopped streaming.
   369  		// If they have, reconnect to the log stream.
   370  		done := make(chan bool)
   371  		go func() {
   372  			ticker := c.clock.NewTicker(podLogHealthCheck)
   373  			for {
   374  				select {
   375  				case <-done:
   376  					return
   377  
   378  				case <-ticker.Chan():
   379  					lastRead := reader.LastReadTime()
   380  					if lastRead.IsZero() || c.clock.Since(lastRead) < podLogHealthCheck {
   381  						continue
   382  					}
   383  
   384  					retry = true
   385  
   386  					// Start reading 2 seconds after the last read.
   387  					//
   388  					// In the common case (where we just haven't gotten any logs in the
   389  					// last 15 seconds), this will ensure we don't duplicate logs.
   390  					//
   391  					// In the uncommon case (where the Kubernetes log buffer exceeded 10MB
   392  					// and got rotated), this will create a 2 second gap in the log, but
   393  					// we think this is acceptable to avoid the duplicate case.
   394  					startReadTime = lastRead.Add(podLogReconnectGap)
   395  					cancel()
   396  					return
   397  				}
   398  			}
   399  		}()
   400  
   401  		c.mu.Lock()
   402  		c.mutateContainerStatus(watch.streamName, containerName, func(cs *ContainerLogStreamStatus) {
   403  			cs.Active = true
   404  			cs.Error = ""
   405  		})
   406  		c.mu.Unlock()
   407  		c.podSource.requeueStream(watch.streamName)
   408  
   409  		writer := &errorCapturingWriter{underlying: logger.Get(ctx).Writer(logger.InfoLvl)}
   410  		_, err = io.Copy(writer, reader)
   411  		_ = readCloser.Close()
   412  		close(done)
   413  
   414  		wasCanceledUpstream := ctx.Err() != nil
   415  		cancel()
   416  
   417  		newlineTerminated := writer.newlineTerminated
   418  		errorTerminated := writer.errorTerminated
   419  		if !newlineTerminated {
   420  			_, _ = writer.Write(newline)
   421  		}
   422  
   423  		if err == nil && errorTerminated != "" {
   424  			err = errors.New(errorTerminated)
   425  		}
   426  
   427  		if !retry && err != nil && !wasCanceledUpstream {
   428  			exitError = err
   429  			return
   430  		}
   431  	}
   432  }
   433  
   434  // Set up the status object for a particular stream, tracking each container individually.
   435  func (c *Controller) ensureStatusActive(streamName types.NamespacedName, containers []v1alpha1.Container) {
   436  	status, ok := c.statuses[streamName]
   437  	if !ok {
   438  		status = &PodLogStreamStatus{}
   439  		c.statuses[streamName] = status
   440  	}
   441  
   442  	// Make sure the container names are right. If they're not, delete everything and recreate.
   443  	isMatching := len(containers) == len(status.ContainerStatuses)
   444  	if isMatching {
   445  		for i, cs := range status.ContainerStatuses {
   446  			if containers[i].Name != cs.Name {
   447  				isMatching = false
   448  				break
   449  			}
   450  		}
   451  	}
   452  
   453  	if isMatching {
   454  		return
   455  	}
   456  
   457  	statuses := make([]ContainerLogStreamStatus, 0, len(containers))
   458  	for _, c := range containers {
   459  		statuses = append(statuses, ContainerLogStreamStatus{
   460  			Name: c.Name,
   461  		})
   462  	}
   463  	status.ContainerStatuses = statuses
   464  	status.Error = ""
   465  	delete(c.debounces, streamName)
   466  }
   467  
   468  // Modify the status of a container log stream.
   469  func (c *Controller) mutateContainerStatus(streamName types.NamespacedName, containerName container.Name, mutate func(*ContainerLogStreamStatus)) {
   470  	status, ok := c.statuses[streamName]
   471  	if !ok {
   472  		return
   473  	}
   474  
   475  	for i, cs := range status.ContainerStatuses {
   476  		if cs.Name != string(containerName) {
   477  			continue
   478  		}
   479  
   480  		mutate(&cs)
   481  		status.ContainerStatuses[i] = cs
   482  	}
   483  }
   484  
   485  // Set the pod log stream to an error status
   486  func (c *Controller) setErrorStatus(streamName types.NamespacedName, err error) reconcile.Result {
   487  	status, ok := c.statuses[streamName]
   488  	if !ok {
   489  		status = &PodLogStreamStatus{}
   490  		c.statuses[streamName] = status
   491  	}
   492  	if err == nil {
   493  		status.Error = ""
   494  		delete(c.debounces, streamName)
   495  	} else {
   496  		status.Error = err.Error()
   497  
   498  		prevDebounce := c.debounces[streamName]
   499  		newDebounce := prevDebounce * 2
   500  
   501  		if newDebounce == 0 {
   502  			newDebounce = time.Second
   503  		} else if newDebounce > maxDebounceDuration {
   504  			newDebounce = maxDebounceDuration
   505  		}
   506  		c.debounces[streamName] = newDebounce
   507  	}
   508  	return reconcile.Result{RequeueAfter: c.debounces[streamName]}
   509  }
   510  
   511  // Update the server with the current container status.
   512  func (c *Controller) maybeUpdateObjectStatus(ctx context.Context, nn types.NamespacedName, obj *v1alpha1.PodLogStream) error {
   513  	status, ok := c.statuses[nn]
   514  	if !ok || apicmp.DeepEqual(*status, obj.Status) {
   515  		return nil
   516  	}
   517  
   518  	oldError := obj.Status.Error
   519  	newError := status.Error
   520  	update := obj.DeepCopy()
   521  	status.DeepCopyInto(&update.Status)
   522  
   523  	err := c.client.Status().Update(c.ctx, update)
   524  	if err != nil {
   525  		return err
   526  	}
   527  
   528  	// Only show new errors.
   529  	//
   530  	// Don't show errors about the pod not being found. Those are pretty common
   531  	// when the pod hasn't shown up in the informer cache yet.
   532  	if newError != "" && !strings.HasPrefix(newError, "pod not found:") && newError != oldError {
   533  		logger.Get(ctx).Errorf("podlogstream %s: %s", obj.Name, newError)
   534  	}
   535  	return nil
   536  }
   537  
   538  func (c *Controller) CreateBuilder(mgr ctrl.Manager) (*builder.Builder, error) {
   539  	b := ctrl.NewControllerManagedBy(mgr).
   540  		For(&PodLogStream{}).
   541  		WatchesRawSource(c.podSource)
   542  
   543  	return b, nil
   544  }
   545  
   546  type podLogWatch struct {
   547  	ctx    context.Context
   548  	cancel func()
   549  
   550  	streamName     types.NamespacedName
   551  	podID          k8s.PodID
   552  	namespace      k8s.Namespace
   553  	cName          container.Name
   554  	startWatchTime time.Time
   555  	doneWatchTime  time.Time
   556  	debounce       time.Duration
   557  	doneCh         chan struct{}
   558  
   559  	shouldPrefix bool // if true, we'll prefix logs with the container name
   560  }
   561  
   562  type podLogKey struct {
   563  	streamName types.NamespacedName
   564  	podID      k8s.PodID
   565  	cID        container.ID
   566  }
   567  
   568  // indexPodLogStreamForTiltAPI indexes a PodLogStream object and returns keys
   569  // for objects from the Tilt apiserver that it watches.
   570  //
   571  // See also: indexPodLogStreamForKubernetes which indexes a PodLogStream object
   572  // and returns keys for objects from the Kubernetes cluster that it watches via
   573  // PodSource.
   574  func indexPodLogStreamForTiltAPI(obj ctrlclient.Object) []indexer.Key {
   575  	var results []indexer.Key
   576  	pls := obj.(*v1alpha1.PodLogStream)
   577  	if pls != nil && pls.Spec.Cluster != "" {
   578  		results = append(results, indexer.Key{
   579  			Name: types.NamespacedName{Namespace: pls.Namespace, Name: pls.Spec.Cluster},
   580  			GVK:  clusterGVK,
   581  		})
   582  	}
   583  	return results
   584  }