github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/checks_hook.go (about) 1 package allocrunner 2 3 import ( 4 "context" 5 "sync" 6 "time" 7 8 "github.com/hashicorp/go-hclog" 9 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 10 "github.com/hashicorp/nomad/client/serviceregistration/checks" 11 "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" 12 "github.com/hashicorp/nomad/helper" 13 "github.com/hashicorp/nomad/nomad/structs" 14 ) 15 16 const ( 17 // checksHookName is the name of this hook as appears in logs 18 checksHookName = "checks_hook" 19 ) 20 21 // observers maintains a map from check_id -> observer for a particular check. Each 22 // observer in the map must share the same context. 23 type observers map[structs.CheckID]*observer 24 25 // An observer is used to execute a particular check on its interval and update the 26 // check store with those results. 27 type observer struct { 28 ctx context.Context 29 cancel context.CancelFunc 30 checker checks.Checker 31 checkStore checkstore.Shim 32 33 qc *checks.QueryContext 34 check *structs.ServiceCheck 35 allocID string 36 } 37 38 // start checking our check on its interval 39 func (o *observer) start() { 40 // compromise between immediate (too early) and waiting full interval (slow) 41 firstWait := o.check.Interval / 2 42 43 timer, cancel := helper.NewSafeTimer(firstWait) 44 defer cancel() 45 46 for { 47 select { 48 49 // exit the observer 50 case <-o.ctx.Done(): 51 return 52 53 // time to execute the check 54 case <-timer.C: 55 query := checks.GetCheckQuery(o.check) 56 result := o.checker.Do(o.ctx, o.qc, query) 57 58 // and put the results into the store (already logged) 59 _ = o.checkStore.Set(o.allocID, result) 60 61 // setup timer for next interval 62 timer.Reset(o.check.Interval) 63 } 64 } 65 } 66 67 // stop checking our check - this will also interrupt an in-progress execution 68 func (o *observer) stop() { 69 o.cancel() 70 } 71 72 // checksHook manages checks of Nomad service registrations, at both the group and 73 // task level, by storing / removing them from the Client state store. 74 // 75 // Does not manage Consul service checks; see groupServiceHook instead. 76 type checksHook struct { 77 logger hclog.Logger 78 network structs.NetworkStatus 79 shim checkstore.Shim 80 checker checks.Checker 81 allocID string 82 83 // fields that get re-initialized on allocation update 84 lock sync.RWMutex 85 ctx context.Context 86 stop func() 87 observers observers 88 alloc *structs.Allocation 89 } 90 91 func newChecksHook( 92 logger hclog.Logger, 93 alloc *structs.Allocation, 94 shim checkstore.Shim, 95 network structs.NetworkStatus, 96 ) *checksHook { 97 h := &checksHook{ 98 logger: logger.Named(checksHookName), 99 allocID: alloc.ID, 100 alloc: alloc, 101 shim: shim, 102 network: network, 103 checker: checks.New(logger), 104 } 105 h.initialize(alloc) 106 return h 107 } 108 109 // initialize the dynamic fields of checksHook, which is to say setup all the 110 // observers and query context things associated with the alloc. 111 // 112 // Should be called during initial setup only. 113 func (h *checksHook) initialize(alloc *structs.Allocation) { 114 h.lock.Lock() 115 defer h.lock.Unlock() 116 117 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 118 if tg == nil { 119 return 120 } 121 122 // fresh context and stop function for this allocation 123 h.ctx, h.stop = context.WithCancel(context.Background()) 124 125 // fresh set of observers 126 h.observers = make(observers) 127 128 // set the initial alloc 129 h.alloc = alloc 130 } 131 132 // observe will create the observer for each service in services. 133 // services must use only nomad service provider. 134 // 135 // Caller must hold h.lock. 136 func (h *checksHook) observe(alloc *structs.Allocation, services []*structs.Service) { 137 var ports structs.AllocatedPorts 138 var networks structs.Networks 139 if alloc.AllocatedResources != nil { 140 ports = alloc.AllocatedResources.Shared.Ports 141 networks = alloc.AllocatedResources.Shared.Networks 142 } 143 144 for _, service := range services { 145 for _, check := range service.Checks { 146 147 // remember the initialization time 148 now := time.Now().UTC().Unix() 149 150 // create the deterministic check id for this check 151 id := structs.NomadCheckID(alloc.ID, alloc.TaskGroup, check) 152 153 // an observer for this check already exists 154 if _, exists := h.observers[id]; exists { 155 continue 156 } 157 158 ctx, cancel := context.WithCancel(h.ctx) 159 160 // create the observer for this check 161 h.observers[id] = &observer{ 162 ctx: ctx, 163 cancel: cancel, 164 check: check.Copy(), 165 checkStore: h.shim, 166 checker: h.checker, 167 allocID: h.allocID, 168 qc: &checks.QueryContext{ 169 ID: id, 170 CustomAddress: service.Address, 171 ServicePortLabel: service.PortLabel, 172 Ports: ports, 173 Networks: networks, 174 NetworkStatus: h.network, 175 Group: alloc.Name, 176 Task: service.TaskName, 177 Service: service.Name, 178 Check: check.Name, 179 }, 180 } 181 182 // insert a pending result into state store for each check 183 result := checks.Stub(id, structs.GetCheckMode(check), now, alloc.Name, service.TaskName, service.Name, check.Name) 184 if err := h.shim.Set(h.allocID, result); err != nil { 185 h.logger.Error("failed to set initial check status", "id", h.allocID, "error", err) 186 continue 187 } 188 189 // start the observer 190 go h.observers[id].start() 191 } 192 } 193 } 194 195 func (h *checksHook) Name() string { 196 return checksHookName 197 } 198 199 func (h *checksHook) Prerun() error { 200 h.lock.Lock() 201 defer h.lock.Unlock() 202 203 group := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) 204 if group == nil { 205 return nil 206 } 207 208 // create and start observers of nomad service checks in alloc 209 h.observe(h.alloc, group.NomadServices()) 210 211 return nil 212 } 213 214 func (h *checksHook) Update(request *interfaces.RunnerUpdateRequest) error { 215 h.lock.Lock() 216 defer h.lock.Unlock() 217 218 group := request.Alloc.Job.LookupTaskGroup(request.Alloc.TaskGroup) 219 if group == nil { 220 return nil 221 } 222 223 // get all group and task level services using nomad provider 224 services := group.NomadServices() 225 226 // create a set of the updated set of checks 227 next := make([]structs.CheckID, 0, len(h.observers)) 228 for _, service := range services { 229 for _, check := range service.Checks { 230 next = append(next, structs.NomadCheckID( 231 request.Alloc.ID, 232 request.Alloc.TaskGroup, 233 check, 234 )) 235 } 236 } 237 238 // stop the observers of the checks we are removing 239 remove := h.shim.Difference(request.Alloc.ID, next) 240 for _, id := range remove { 241 h.observers[id].stop() 242 delete(h.observers, id) 243 } 244 245 // remove checks that are no longer part of the allocation 246 if err := h.shim.Remove(request.Alloc.ID, remove); err != nil { 247 return err 248 } 249 250 // remember this new alloc 251 h.alloc = request.Alloc 252 253 // ensure we are observing new checks (idempotent) 254 h.observe(request.Alloc, services) 255 256 return nil 257 } 258 259 func (h *checksHook) PreKill() { 260 h.lock.Lock() 261 defer h.lock.Unlock() 262 263 // terminate our hook context, which threads down to all observers 264 h.stop() 265 266 // purge all checks for this allocation from the client state store 267 if err := h.shim.Purge(h.allocID); err != nil { 268 h.logger.Error("failed to purge check results", "alloc_id", h.allocID, "error", err) 269 } 270 }