github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/client/alloc_runner.go (about) 1 package client 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "log" 7 "os" 8 "path/filepath" 9 "sync" 10 "time" 11 12 "github.com/hashicorp/go-multierror" 13 "github.com/hashicorp/nomad/client/allocdir" 14 "github.com/hashicorp/nomad/client/config" 15 "github.com/hashicorp/nomad/client/driver" 16 "github.com/hashicorp/nomad/nomad/structs" 17 ) 18 19 const ( 20 // allocSyncRetryIntv is the interval on which we retry updating 21 // the status of the allocation 22 allocSyncRetryIntv = 15 * time.Second 23 ) 24 25 // taskStatus is used to track the status of a task 26 type taskStatus struct { 27 Status string 28 Description string 29 } 30 31 // AllocStateUpdater is used to update the status of an allocation 32 type AllocStateUpdater func(alloc *structs.Allocation) error 33 34 // AllocRunner is used to wrap an allocation and provide the execution context. 35 type AllocRunner struct { 36 config *config.Config 37 updater AllocStateUpdater 38 logger *log.Logger 39 40 alloc *structs.Allocation 41 42 dirtyCh chan struct{} 43 44 ctx *driver.ExecContext 45 tasks map[string]*TaskRunner 46 taskLock sync.RWMutex 47 48 taskStatus map[string]taskStatus 49 taskStatusLock sync.RWMutex 50 51 updateCh chan *structs.Allocation 52 53 destroy bool 54 destroyCh chan struct{} 55 destroyLock sync.Mutex 56 waitCh chan struct{} 57 } 58 59 // allocRunnerState is used to snapshot the state of the alloc runner 60 type allocRunnerState struct { 61 Alloc *structs.Allocation 62 TaskStatus map[string]taskStatus 63 Context *driver.ExecContext 64 } 65 66 // NewAllocRunner is used to create a new allocation context 67 func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStateUpdater, alloc *structs.Allocation) *AllocRunner { 68 ar := &AllocRunner{ 69 config: config, 70 updater: updater, 71 logger: logger, 72 alloc: alloc, 73 dirtyCh: make(chan struct{}, 1), 74 tasks: make(map[string]*TaskRunner), 75 taskStatus: make(map[string]taskStatus), 76 updateCh: make(chan *structs.Allocation, 8), 77 destroyCh: make(chan struct{}), 78 waitCh: make(chan struct{}), 79 } 80 return ar 81 } 82 83 // stateFilePath returns the path to our state file 84 func (r *AllocRunner) stateFilePath() string { 85 return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, "state.json") 86 } 87 88 // RestoreState is used to restore the state of the alloc runner 89 func (r *AllocRunner) RestoreState() error { 90 // Load the snapshot 91 var snap allocRunnerState 92 if err := restoreState(r.stateFilePath(), &snap); err != nil { 93 return err 94 } 95 96 // Restore fields 97 r.alloc = snap.Alloc 98 r.taskStatus = snap.TaskStatus 99 r.ctx = snap.Context 100 101 // Restore the task runners 102 var mErr multierror.Error 103 for name := range r.taskStatus { 104 task := &structs.Task{Name: name} 105 tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task) 106 r.tasks[name] = tr 107 if err := tr.RestoreState(); err != nil { 108 r.logger.Printf("[ERR] client: failed to restore state for alloc %s task '%s': %v", r.alloc.ID, name, err) 109 mErr.Errors = append(mErr.Errors, err) 110 } else { 111 go tr.Run() 112 } 113 } 114 return mErr.ErrorOrNil() 115 } 116 117 // SaveState is used to snapshot our state 118 func (r *AllocRunner) SaveState() error { 119 r.taskStatusLock.RLock() 120 snap := allocRunnerState{ 121 Alloc: r.alloc, 122 TaskStatus: r.taskStatus, 123 Context: r.ctx, 124 } 125 err := persistState(r.stateFilePath(), &snap) 126 r.taskStatusLock.RUnlock() 127 if err != nil { 128 return err 129 } 130 131 // Save state for each task 132 r.taskLock.RLock() 133 defer r.taskLock.RUnlock() 134 var mErr multierror.Error 135 for name, tr := range r.tasks { 136 if err := tr.SaveState(); err != nil { 137 r.logger.Printf("[ERR] client: failed to save state for alloc %s task '%s': %v", 138 r.alloc.ID, name, err) 139 mErr.Errors = append(mErr.Errors, err) 140 } 141 } 142 return mErr.ErrorOrNil() 143 } 144 145 // DestroyState is used to cleanup after ourselves 146 func (r *AllocRunner) DestroyState() error { 147 return os.RemoveAll(filepath.Dir(r.stateFilePath())) 148 } 149 150 // DestroyContext is used to destroy the context 151 func (r *AllocRunner) DestroyContext() error { 152 return r.ctx.AllocDir.Destroy() 153 } 154 155 // Alloc returns the associated allocation 156 func (r *AllocRunner) Alloc() *structs.Allocation { 157 return r.alloc 158 } 159 160 // setAlloc is used to update the allocation of the runner 161 // we preserve the existing client status and description 162 func (r *AllocRunner) setAlloc(alloc *structs.Allocation) { 163 if r.alloc != nil { 164 alloc.ClientStatus = r.alloc.ClientStatus 165 alloc.ClientDescription = r.alloc.ClientDescription 166 } 167 r.alloc = alloc 168 } 169 170 // dirtySyncState is used to watch for state being marked dirty to sync 171 func (r *AllocRunner) dirtySyncState() { 172 for { 173 select { 174 case <-r.dirtyCh: 175 r.retrySyncState(r.destroyCh) 176 case <-r.destroyCh: 177 return 178 } 179 } 180 } 181 182 // retrySyncState is used to retry the state sync until success 183 func (r *AllocRunner) retrySyncState(stopCh chan struct{}) { 184 for { 185 err := r.syncStatus() 186 if err == nil { 187 return 188 } 189 select { 190 case <-time.After(allocSyncRetryIntv + randomStagger(allocSyncRetryIntv)): 191 case <-stopCh: 192 return 193 } 194 } 195 } 196 197 // syncStatus is used to run and sync the status when it changes 198 func (r *AllocRunner) syncStatus() error { 199 // Scan the task status to termine the status of the alloc 200 var pending, running, dead, failed bool 201 r.taskStatusLock.RLock() 202 pending = len(r.taskStatus) < len(r.tasks) 203 for _, status := range r.taskStatus { 204 switch status.Status { 205 case structs.AllocClientStatusRunning: 206 running = true 207 case structs.AllocClientStatusDead: 208 dead = true 209 case structs.AllocClientStatusFailed: 210 failed = true 211 } 212 } 213 if len(r.taskStatus) > 0 { 214 taskDesc, _ := json.Marshal(r.taskStatus) 215 r.alloc.ClientDescription = string(taskDesc) 216 } 217 r.taskStatusLock.RUnlock() 218 219 // Determine the alloc status 220 if failed { 221 r.alloc.ClientStatus = structs.AllocClientStatusFailed 222 } else if running { 223 r.alloc.ClientStatus = structs.AllocClientStatusRunning 224 } else if dead && !pending { 225 r.alloc.ClientStatus = structs.AllocClientStatusDead 226 } 227 228 // Attempt to update the status 229 if err := r.updater(r.alloc); err != nil { 230 r.logger.Printf("[ERR] client: failed to update alloc '%s' status to %s: %s", 231 r.alloc.ID, r.alloc.ClientStatus, err) 232 return err 233 } 234 return nil 235 } 236 237 // setStatus is used to update the allocation status 238 func (r *AllocRunner) setStatus(status, desc string) { 239 r.alloc.ClientStatus = status 240 r.alloc.ClientDescription = desc 241 select { 242 case r.dirtyCh <- struct{}{}: 243 default: 244 } 245 } 246 247 // setTaskStatus is used to set the status of a task 248 func (r *AllocRunner) setTaskStatus(taskName, status, desc string) { 249 r.taskStatusLock.Lock() 250 r.taskStatus[taskName] = taskStatus{ 251 Status: status, 252 Description: desc, 253 } 254 r.taskStatusLock.Unlock() 255 select { 256 case r.dirtyCh <- struct{}{}: 257 default: 258 } 259 } 260 261 // Run is a long running goroutine used to manage an allocation 262 func (r *AllocRunner) Run() { 263 defer close(r.waitCh) 264 go r.dirtySyncState() 265 266 // Check if the allocation is in a terminal status 267 alloc := r.alloc 268 if alloc.TerminalStatus() { 269 r.logger.Printf("[DEBUG] client: aborting runner for alloc '%s', terminal status", r.alloc.ID) 270 return 271 } 272 r.logger.Printf("[DEBUG] client: starting runner for alloc '%s'", r.alloc.ID) 273 274 // Find the task group to run in the allocation 275 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 276 if tg == nil { 277 r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup) 278 r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup)) 279 return 280 } 281 282 // Create the execution context 283 if r.ctx == nil { 284 allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID)) 285 if err := allocDir.Build(tg.Tasks); err != nil { 286 r.logger.Printf("[WARN] client: failed to build task directories: %v", err) 287 r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup)) 288 return 289 } 290 r.ctx = driver.NewExecContext(allocDir) 291 } 292 293 // Start the task runners 294 r.taskLock.Lock() 295 for _, task := range tg.Tasks { 296 // Skip tasks that were restored 297 if _, ok := r.tasks[task.Name]; ok { 298 continue 299 } 300 301 // Merge in the task resources 302 task.Resources = alloc.TaskResources[task.Name] 303 304 tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task) 305 r.tasks[task.Name] = tr 306 go tr.Run() 307 } 308 r.taskLock.Unlock() 309 310 OUTER: 311 // Wait for updates 312 for { 313 select { 314 case update := <-r.updateCh: 315 // Check if we're in a terminal status 316 if update.TerminalStatus() { 317 r.setAlloc(update) 318 break OUTER 319 } 320 321 // Update the task groups 322 r.taskLock.RLock() 323 for _, task := range tg.Tasks { 324 tr := r.tasks[task.Name] 325 326 // Merge in the task resources 327 task.Resources = update.TaskResources[task.Name] 328 tr.Update(task) 329 } 330 r.taskLock.RUnlock() 331 332 case <-r.destroyCh: 333 break OUTER 334 } 335 } 336 337 // Destroy each sub-task 338 r.taskLock.RLock() 339 defer r.taskLock.RUnlock() 340 for _, tr := range r.tasks { 341 tr.Destroy() 342 } 343 344 // Wait for termination of the task runners 345 for _, tr := range r.tasks { 346 <-tr.WaitCh() 347 } 348 349 // Final state sync 350 r.retrySyncState(nil) 351 352 // Check if we should destroy our state 353 if r.destroy { 354 if err := r.DestroyContext(); err != nil { 355 r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v", 356 r.alloc.ID, err) 357 } 358 if err := r.DestroyState(); err != nil { 359 r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v", 360 r.alloc.ID, err) 361 } 362 } 363 r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID) 364 } 365 366 // Update is used to update the allocation of the context 367 func (r *AllocRunner) Update(update *structs.Allocation) { 368 select { 369 case r.updateCh <- update: 370 default: 371 r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID) 372 } 373 } 374 375 // Destroy is used to indicate that the allocation context should be destroyed 376 func (r *AllocRunner) Destroy() { 377 r.destroyLock.Lock() 378 defer r.destroyLock.Unlock() 379 380 if r.destroy { 381 return 382 } 383 r.destroy = true 384 close(r.destroyCh) 385 } 386 387 // WaitCh returns a channel to wait for termination 388 func (r *AllocRunner) WaitCh() <-chan struct{} { 389 return r.waitCh 390 }