github.com/tilt-dev/tilt@v0.33.15-0.20240515162809-0a22ed45d8a0/internal/controllers/core/cluster/monitor.go (about)

     1  package cluster
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"sync"
     7  
     8  	"github.com/jonboulle/clockwork"
     9  	"k8s.io/apimachinery/pkg/types"
    10  
    11  	"github.com/tilt-dev/tilt/internal/controllers/indexer"
    12  	"github.com/tilt-dev/tilt/internal/k8s"
    13  )
    14  
    15  type clusterHealthMonitor struct {
    16  	mu        sync.Mutex
    17  	globalCtx context.Context
    18  	clock     clockwork.Clock
    19  	requeuer  *indexer.Requeuer
    20  	monitors  map[types.NamespacedName]monitor
    21  }
    22  
    23  func newClusterHealthMonitor(globalCtx context.Context, clock clockwork.Clock, requeuer *indexer.Requeuer) *clusterHealthMonitor {
    24  	return &clusterHealthMonitor{
    25  		globalCtx: globalCtx,
    26  		clock:     clock,
    27  		requeuer:  requeuer,
    28  		monitors:  make(map[types.NamespacedName]monitor),
    29  	}
    30  }
    31  
    32  func (c *clusterHealthMonitor) Start(clusterNN types.NamespacedName, conn connection) context.Context {
    33  	c.mu.Lock()
    34  	defer c.mu.Unlock()
    35  
    36  	c.cleanup(clusterNN)
    37  	ctx, cancel := context.WithCancel(c.globalCtx)
    38  	c.monitors[clusterNN] = monitor{cancel: cancel}
    39  	go c.run(ctx, clusterNN, conn)
    40  
    41  	return ctx
    42  }
    43  
    44  func (c *clusterHealthMonitor) GetStatus(clusterNN types.NamespacedName) string {
    45  	c.mu.Lock()
    46  	defer c.mu.Unlock()
    47  	return c.monitors[clusterNN].error
    48  }
    49  
    50  func (c *clusterHealthMonitor) UpdateStatus(ctx context.Context, clusterNN types.NamespacedName, error string) {
    51  	c.mu.Lock()
    52  	defer c.mu.Unlock()
    53  
    54  	if ctx.Err() != nil {
    55  		// if the context as canceled while the health check was running,
    56  		// it might be the cause of the error, which isn't actually a health
    57  		// check failure; it's also possible we'd be doing a stale update
    58  		return
    59  	}
    60  
    61  	if m, ok := c.monitors[clusterNN]; ok {
    62  		if m.error == error {
    63  			return
    64  		}
    65  		m.error = error
    66  		c.monitors[clusterNN] = m
    67  		c.requeuer.Add(clusterNN)
    68  	}
    69  }
    70  
    71  func (c *clusterHealthMonitor) Stop(clusterNN types.NamespacedName) {
    72  	c.mu.Lock()
    73  	defer c.mu.Unlock()
    74  
    75  	c.cleanup(clusterNN)
    76  	delete(c.monitors, clusterNN)
    77  }
    78  
    79  func (c *clusterHealthMonitor) cleanup(clusterNN types.NamespacedName) {
    80  	m := c.monitors[clusterNN]
    81  	if m.cancel != nil {
    82  		m.cancel()
    83  	}
    84  }
    85  
    86  type monitor struct {
    87  	cancel context.CancelFunc
    88  	error  string
    89  }
    90  
    91  func (c *clusterHealthMonitor) run(ctx context.Context, clusterNN types.NamespacedName, conn connection) {
    92  	if conn.connType != connectionTypeK8s {
    93  		// live connection monitoring for Docker not yet supported
    94  		return
    95  	}
    96  
    97  	ticker := c.clock.NewTicker(clientHealthPollInterval)
    98  	defer ticker.Stop()
    99  	for {
   100  		err := doKubernetesHealthCheck(ctx, conn.k8sClient)
   101  		if err != nil {
   102  			c.UpdateStatus(ctx, clusterNN, err.Error())
   103  		} else {
   104  			c.UpdateStatus(ctx, clusterNN, "")
   105  		}
   106  
   107  		select {
   108  		case <-ticker.Chan():
   109  		case <-ctx.Done():
   110  			return
   111  		}
   112  	}
   113  }
   114  
   115  func doKubernetesHealthCheck(ctx context.Context, client k8s.Client) error {
   116  	// TODO(milas): use verbose=true and propagate the info to the Tilt API
   117  	// 	cluster obj to show in the web UI
   118  	health, err := client.ClusterHealth(ctx, false)
   119  	if err != nil {
   120  		return err
   121  	}
   122  
   123  	if !health.Live {
   124  		return errors.New("cluster did not pass liveness check")
   125  	}
   126  
   127  	if !health.Ready {
   128  		return errors.New("cluster not ready")
   129  	}
   130  
   131  	return nil
   132  }