// github.com/tilt-dev/tilt@v0.33.15-0.20240515162809-0a22ed45d8a0/internal/controllers/core/cluster/monitor.go

package cluster

import (
	"context"
	"errors"
	"sync"

	"github.com/jonboulle/clockwork"
	"k8s.io/apimachinery/pkg/types"

	"github.com/tilt-dev/tilt/internal/controllers/indexer"
	"github.com/tilt-dev/tilt/internal/k8s"
)

// clusterHealthMonitor runs a background health-check loop per cluster and
// records the most recent health-check error (if any) for each one.
type clusterHealthMonitor struct {
	mu        sync.Mutex
	globalCtx context.Context
	clock     clockwork.Clock
	requeuer  *indexer.Requeuer
	monitors  map[types.NamespacedName]monitor
}

func newClusterHealthMonitor(globalCtx context.Context, clock clockwork.Clock, requeuer *indexer.Requeuer) *clusterHealthMonitor {
	return &clusterHealthMonitor{
		globalCtx: globalCtx,
		clock:     clock,
		requeuer:  requeuer,
		monitors:  make(map[types.NamespacedName]monitor),
	}
}

// Start cancels any existing monitor for the cluster, launches a new
// background poll loop, and returns the context that scopes that loop.
func (c *clusterHealthMonitor) Start(clusterNN types.NamespacedName, conn connection) context.Context {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.cleanup(clusterNN)
	ctx, cancel := context.WithCancel(c.globalCtx)
	c.monitors[clusterNN] = monitor{cancel: cancel}
	go c.run(ctx, clusterNN, conn)

	return ctx
}

// GetStatus returns the last recorded health-check error for the cluster,
// or the empty string if the cluster is healthy or unknown.
func (c *clusterHealthMonitor) GetStatus(clusterNN types.NamespacedName) string {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.monitors[clusterNN].error
}

// UpdateStatus records the latest health-check result and requeues the
// cluster for reconciliation if the status changed.
func (c *clusterHealthMonitor) UpdateStatus(ctx context.Context, clusterNN types.NamespacedName, error string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if ctx.Err() != nil {
		// If the context was canceled while the health check was running,
		// the cancelation itself might be the cause of the error, which
		// isn't actually a health-check failure; it's also possible we'd
		// be writing a stale update.
		return
	}

	if m, ok := c.monitors[clusterNN]; ok {
		if m.error == error {
			return
		}
		m.error = error
		c.monitors[clusterNN] = m
		c.requeuer.Add(clusterNN)
	}
}

// Stop cancels the cluster's poll loop and removes it from the monitor map.
func (c *clusterHealthMonitor) Stop(clusterNN types.NamespacedName) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.cleanup(clusterNN)
	delete(c.monitors, clusterNN)
}

// cleanup cancels the cluster's monitor context if one exists.
// The caller must hold c.mu.
func (c *clusterHealthMonitor) cleanup(clusterNN types.NamespacedName) {
	m := c.monitors[clusterNN]
	if m.cancel != nil {
		m.cancel()
	}
}

type monitor struct {
	cancel context.CancelFunc
	error  string
}

// run polls cluster health once per clientHealthPollInterval until ctx is
// canceled, reporting each result via UpdateStatus.
func (c *clusterHealthMonitor) run(ctx context.Context, clusterNN types.NamespacedName, conn connection) {
	if conn.connType != connectionTypeK8s {
		// Live connection monitoring for Docker is not yet supported.
		return
	}

	ticker := c.clock.NewTicker(clientHealthPollInterval)
	defer ticker.Stop()
	for {
		err := doKubernetesHealthCheck(ctx, conn.k8sClient)
		if err != nil {
			c.UpdateStatus(ctx, clusterNN, err.Error())
		} else {
			c.UpdateStatus(ctx, clusterNN, "")
		}

		select {
		case <-ticker.Chan():
		case <-ctx.Done():
			return
		}
	}
}

// doKubernetesHealthCheck checks the cluster's liveness and readiness and
// returns an error if either check fails.
func doKubernetesHealthCheck(ctx context.Context, client k8s.Client) error {
	// TODO(milas): use verbose=true and propagate the info to the Tilt API
	//  cluster obj to show in the web UI
	health, err := client.ClusterHealth(ctx, false)
	if err != nil {
		return err
	}

	if !health.Live {
		return errors.New("cluster did not pass liveness check")
	}

	if !health.Ready {
		return errors.New("cluster not ready")
	}

	return nil
}
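
// Usage sketch: a minimal, hypothetical illustration of how a caller (such
// as this package's cluster Reconciler) might drive the monitor. The
// connection literal relies only on the connType and k8sClient fields
// referenced above; the function name and wiring are illustrative, not part
// of this file. In tests, clockwork.NewFakeClock() can replace the real
// clock so the poll interval can be advanced deterministically.
//
//	func exampleMonitorLifecycle(ctx context.Context, requeuer *indexer.Requeuer, client k8s.Client) {
//		hm := newClusterHealthMonitor(ctx, clockwork.NewRealClock(), requeuer)
//		nn := types.NamespacedName{Name: "default"}
//
//		// Start launches the background poll loop for this cluster.
//		_ = hm.Start(nn, connection{connType: connectionTypeK8s, k8sClient: client})
//
//		// The reconciler later reads the last health-check error
//		// ("" means healthy) and surfaces it on the Cluster status.
//		_ = hm.GetStatus(nn)
//
//		// Stop cancels the poll loop and forgets the cluster.
//		hm.Stop(nn)
//	}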