github.com/bigcommerce/nomad@v0.9.3-bc/drivers/exec/driver.go (about) 1 package exec 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "path/filepath" 8 "runtime" 9 "sync" 10 "time" 11 12 "github.com/hashicorp/consul-template/signals" 13 hclog "github.com/hashicorp/go-hclog" 14 "github.com/hashicorp/nomad/client/fingerprint" 15 "github.com/hashicorp/nomad/drivers/shared/eventer" 16 "github.com/hashicorp/nomad/drivers/shared/executor" 17 "github.com/hashicorp/nomad/helper" 18 "github.com/hashicorp/nomad/helper/pluginutils/loader" 19 "github.com/hashicorp/nomad/plugins/base" 20 "github.com/hashicorp/nomad/plugins/drivers" 21 "github.com/hashicorp/nomad/plugins/drivers/utils" 22 "github.com/hashicorp/nomad/plugins/shared/hclspec" 23 pstructs "github.com/hashicorp/nomad/plugins/shared/structs" 24 ) 25 26 const ( 27 // pluginName is the name of the plugin 28 pluginName = "exec" 29 30 // fingerprintPeriod is the interval at which the driver will send fingerprint responses 31 fingerprintPeriod = 30 * time.Second 32 33 // taskHandleVersion is the version of task handle which this driver sets 34 // and understands how to decode driver state 35 taskHandleVersion = 1 36 ) 37 38 var ( 39 // PluginID is the exec plugin metadata registered in the plugin 40 // catalog. 41 PluginID = loader.PluginID{ 42 Name: pluginName, 43 PluginType: base.PluginTypeDriver, 44 } 45 46 // PluginConfig is the exec driver factory function registered in the 47 // plugin catalog. 48 PluginConfig = &loader.InternalPluginConfig{ 49 Config: map[string]interface{}{}, 50 Factory: func(l hclog.Logger) interface{} { return NewExecDriver(l) }, 51 } 52 53 // pluginInfo is the response returned for the PluginInfo RPC 54 pluginInfo = &base.PluginInfoResponse{ 55 Type: base.PluginTypeDriver, 56 PluginApiVersions: []string{drivers.ApiVersion010}, 57 PluginVersion: "0.1.0", 58 Name: pluginName, 59 } 60 61 // configSpec is the hcl specification returned by the ConfigSchema RPC 62 configSpec = hclspec.NewObject(map[string]*hclspec.Spec{}) 63 64 // taskConfigSpec is the hcl specification for the driver config section of 65 // a task within a job. It is returned in the TaskConfigSchema RPC 66 taskConfigSpec = hclspec.NewObject(map[string]*hclspec.Spec{ 67 "command": hclspec.NewAttr("command", "string", true), 68 "args": hclspec.NewAttr("args", "list(string)", false), 69 }) 70 71 // capabilities is returned by the Capabilities RPC and indicates what 72 // optional features this driver supports 73 capabilities = &drivers.Capabilities{ 74 SendSignals: true, 75 Exec: true, 76 FSIsolation: drivers.FSIsolationChroot, 77 } 78 ) 79 80 // Driver fork/execs tasks using many of the underlying OS's isolation 81 // features where configured. 82 type Driver struct { 83 // eventer is used to handle multiplexing of TaskEvents calls such that an 84 // event can be broadcast to all callers 85 eventer *eventer.Eventer 86 87 // nomadConfig is the client config from nomad 88 nomadConfig *base.ClientDriverConfig 89 90 // tasks is the in memory datastore mapping taskIDs to driverHandles 91 tasks *taskStore 92 93 // ctx is the context for the driver. It is passed to other subsystems to 94 // coordinate shutdown 95 ctx context.Context 96 97 // signalShutdown is called when the driver is shutting down and cancels the 98 // ctx passed to any subsystems 99 signalShutdown context.CancelFunc 100 101 // logger will log to the Nomad agent 102 logger hclog.Logger 103 104 // A tri-state boolean to know if the fingerprinting has happened and 105 // whether it has been successful 106 fingerprintSuccess *bool 107 fingerprintLock sync.Mutex 108 } 109 110 // TaskConfig is the driver configuration of a task within a job 111 type TaskConfig struct { 112 Command string `codec:"command"` 113 Args []string `codec:"args"` 114 } 115 116 // TaskState is the state which is encoded in the handle returned in 117 // StartTask. This information is needed to rebuild the task state and handler 118 // during recovery. 119 type TaskState struct { 120 ReattachConfig *pstructs.ReattachConfig 121 TaskConfig *drivers.TaskConfig 122 Pid int 123 StartedAt time.Time 124 } 125 126 // NewExecDriver returns a new DrivePlugin implementation 127 func NewExecDriver(logger hclog.Logger) drivers.DriverPlugin { 128 ctx, cancel := context.WithCancel(context.Background()) 129 logger = logger.Named(pluginName) 130 return &Driver{ 131 eventer: eventer.NewEventer(ctx, logger), 132 tasks: newTaskStore(), 133 ctx: ctx, 134 signalShutdown: cancel, 135 logger: logger, 136 } 137 } 138 139 // setFingerprintSuccess marks the driver as having fingerprinted successfully 140 func (d *Driver) setFingerprintSuccess() { 141 d.fingerprintLock.Lock() 142 d.fingerprintSuccess = helper.BoolToPtr(true) 143 d.fingerprintLock.Unlock() 144 } 145 146 // setFingerprintFailure marks the driver as having failed fingerprinting 147 func (d *Driver) setFingerprintFailure() { 148 d.fingerprintLock.Lock() 149 d.fingerprintSuccess = helper.BoolToPtr(false) 150 d.fingerprintLock.Unlock() 151 } 152 153 // fingerprintSuccessful returns true if the driver has 154 // never fingerprinted or has successfully fingerprinted 155 func (d *Driver) fingerprintSuccessful() bool { 156 d.fingerprintLock.Lock() 157 defer d.fingerprintLock.Unlock() 158 return d.fingerprintSuccess == nil || *d.fingerprintSuccess 159 } 160 161 func (d *Driver) PluginInfo() (*base.PluginInfoResponse, error) { 162 return pluginInfo, nil 163 } 164 165 func (d *Driver) ConfigSchema() (*hclspec.Spec, error) { 166 return configSpec, nil 167 } 168 169 func (d *Driver) SetConfig(cfg *base.Config) error { 170 if cfg != nil && cfg.AgentConfig != nil { 171 d.nomadConfig = cfg.AgentConfig.Driver 172 } 173 return nil 174 } 175 176 func (d *Driver) Shutdown() { 177 d.signalShutdown() 178 } 179 180 func (d *Driver) TaskConfigSchema() (*hclspec.Spec, error) { 181 return taskConfigSpec, nil 182 } 183 184 func (d *Driver) Capabilities() (*drivers.Capabilities, error) { 185 return capabilities, nil 186 } 187 188 func (d *Driver) Fingerprint(ctx context.Context) (<-chan *drivers.Fingerprint, error) { 189 ch := make(chan *drivers.Fingerprint) 190 go d.handleFingerprint(ctx, ch) 191 return ch, nil 192 193 } 194 func (d *Driver) handleFingerprint(ctx context.Context, ch chan<- *drivers.Fingerprint) { 195 defer close(ch) 196 ticker := time.NewTimer(0) 197 for { 198 select { 199 case <-ctx.Done(): 200 return 201 case <-d.ctx.Done(): 202 return 203 case <-ticker.C: 204 ticker.Reset(fingerprintPeriod) 205 ch <- d.buildFingerprint() 206 } 207 } 208 } 209 210 func (d *Driver) buildFingerprint() *drivers.Fingerprint { 211 if runtime.GOOS != "linux" { 212 d.setFingerprintFailure() 213 return &drivers.Fingerprint{ 214 Health: drivers.HealthStateUndetected, 215 HealthDescription: "exec driver unsupported on client OS", 216 } 217 } 218 219 fp := &drivers.Fingerprint{ 220 Attributes: map[string]*pstructs.Attribute{}, 221 Health: drivers.HealthStateHealthy, 222 HealthDescription: drivers.DriverHealthy, 223 } 224 225 if !utils.IsUnixRoot() { 226 fp.Health = drivers.HealthStateUndetected 227 fp.HealthDescription = drivers.DriverRequiresRootMessage 228 d.setFingerprintFailure() 229 return fp 230 } 231 232 mount, err := fingerprint.FindCgroupMountpointDir() 233 if err != nil { 234 fp.Health = drivers.HealthStateUnhealthy 235 fp.HealthDescription = drivers.NoCgroupMountMessage 236 if d.fingerprintSuccessful() { 237 d.logger.Warn(fp.HealthDescription, "error", err) 238 } 239 d.setFingerprintFailure() 240 return fp 241 } 242 243 if mount == "" { 244 fp.Health = drivers.HealthStateUnhealthy 245 fp.HealthDescription = drivers.CgroupMountEmpty 246 d.setFingerprintFailure() 247 return fp 248 } 249 250 fp.Attributes["driver.exec"] = pstructs.NewBoolAttribute(true) 251 d.setFingerprintSuccess() 252 return fp 253 } 254 255 func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error { 256 if handle == nil { 257 return fmt.Errorf("handle cannot be nil") 258 } 259 260 // COMPAT(0.10): pre 0.9 upgrade path check 261 if handle.Version == 0 { 262 return d.recoverPre09Task(handle) 263 } 264 265 // If already attached to handle there's nothing to recover. 266 if _, ok := d.tasks.Get(handle.Config.ID); ok { 267 d.logger.Trace("nothing to recover; task already exists", 268 "task_id", handle.Config.ID, 269 "task_name", handle.Config.Name, 270 ) 271 return nil 272 } 273 274 // Handle doesn't already exist, try to reattach 275 var taskState TaskState 276 if err := handle.GetDriverState(&taskState); err != nil { 277 d.logger.Error("failed to decode task state from handle", "error", err, "task_id", handle.Config.ID) 278 return fmt.Errorf("failed to decode task state from handle: %v", err) 279 } 280 281 // Create client for reattached executor 282 plugRC, err := pstructs.ReattachConfigToGoPlugin(taskState.ReattachConfig) 283 if err != nil { 284 d.logger.Error("failed to build ReattachConfig from task state", "error", err, "task_id", handle.Config.ID) 285 return fmt.Errorf("failed to build ReattachConfig from task state: %v", err) 286 } 287 288 exec, pluginClient, err := executor.ReattachToExecutor(plugRC, 289 d.logger.With("task_name", handle.Config.Name, "alloc_id", handle.Config.AllocID)) 290 if err != nil { 291 d.logger.Error("failed to reattach to executor", "error", err, "task_id", handle.Config.ID) 292 return fmt.Errorf("failed to reattach to executor: %v", err) 293 } 294 295 h := &taskHandle{ 296 exec: exec, 297 pid: taskState.Pid, 298 pluginClient: pluginClient, 299 taskConfig: taskState.TaskConfig, 300 procState: drivers.TaskStateRunning, 301 startedAt: taskState.StartedAt, 302 exitResult: &drivers.ExitResult{}, 303 } 304 305 d.tasks.Set(taskState.TaskConfig.ID, h) 306 307 go h.run() 308 return nil 309 } 310 311 func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drivers.DriverNetwork, error) { 312 if _, ok := d.tasks.Get(cfg.ID); ok { 313 return nil, nil, fmt.Errorf("task with ID %q already started", cfg.ID) 314 } 315 316 var driverConfig TaskConfig 317 if err := cfg.DecodeDriverConfig(&driverConfig); err != nil { 318 return nil, nil, fmt.Errorf("failed to decode driver config: %v", err) 319 } 320 321 d.logger.Info("starting task", "driver_cfg", hclog.Fmt("%+v", driverConfig)) 322 handle := drivers.NewTaskHandle(taskHandleVersion) 323 handle.Config = cfg 324 325 pluginLogFile := filepath.Join(cfg.TaskDir().Dir, "executor.out") 326 executorConfig := &executor.ExecutorConfig{ 327 LogFile: pluginLogFile, 328 LogLevel: "debug", 329 FSIsolation: true, 330 } 331 332 exec, pluginClient, err := executor.CreateExecutor( 333 d.logger.With("task_name", handle.Config.Name, "alloc_id", handle.Config.AllocID), 334 d.nomadConfig, executorConfig) 335 if err != nil { 336 return nil, nil, fmt.Errorf("failed to create executor: %v", err) 337 } 338 339 user := cfg.User 340 if user == "" { 341 user = "nobody" 342 } 343 344 execCmd := &executor.ExecCommand{ 345 Cmd: driverConfig.Command, 346 Args: driverConfig.Args, 347 Env: cfg.EnvList(), 348 User: user, 349 ResourceLimits: true, 350 Resources: cfg.Resources, 351 TaskDir: cfg.TaskDir().Dir, 352 StdoutPath: cfg.StdoutPath, 353 StderrPath: cfg.StderrPath, 354 Mounts: cfg.Mounts, 355 Devices: cfg.Devices, 356 } 357 358 ps, err := exec.Launch(execCmd) 359 if err != nil { 360 pluginClient.Kill() 361 return nil, nil, fmt.Errorf("failed to launch command with executor: %v", err) 362 } 363 364 h := &taskHandle{ 365 exec: exec, 366 pid: ps.Pid, 367 pluginClient: pluginClient, 368 taskConfig: cfg, 369 procState: drivers.TaskStateRunning, 370 startedAt: time.Now().Round(time.Millisecond), 371 logger: d.logger, 372 } 373 374 driverState := TaskState{ 375 ReattachConfig: pstructs.ReattachConfigFromGoPlugin(pluginClient.ReattachConfig()), 376 Pid: ps.Pid, 377 TaskConfig: cfg, 378 StartedAt: h.startedAt, 379 } 380 381 if err := handle.SetDriverState(&driverState); err != nil { 382 d.logger.Error("failed to start task, error setting driver state", "error", err) 383 exec.Shutdown("", 0) 384 pluginClient.Kill() 385 return nil, nil, fmt.Errorf("failed to set driver state: %v", err) 386 } 387 388 d.tasks.Set(cfg.ID, h) 389 go h.run() 390 return handle, nil, nil 391 } 392 393 func (d *Driver) WaitTask(ctx context.Context, taskID string) (<-chan *drivers.ExitResult, error) { 394 handle, ok := d.tasks.Get(taskID) 395 if !ok { 396 return nil, drivers.ErrTaskNotFound 397 } 398 399 ch := make(chan *drivers.ExitResult) 400 go d.handleWait(ctx, handle, ch) 401 402 return ch, nil 403 } 404 405 func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *drivers.ExitResult) { 406 defer close(ch) 407 var result *drivers.ExitResult 408 ps, err := handle.exec.Wait(ctx) 409 if err != nil { 410 result = &drivers.ExitResult{ 411 Err: fmt.Errorf("executor: error waiting on process: %v", err), 412 } 413 } else { 414 result = &drivers.ExitResult{ 415 ExitCode: ps.ExitCode, 416 Signal: ps.Signal, 417 } 418 } 419 420 select { 421 case <-ctx.Done(): 422 return 423 case <-d.ctx.Done(): 424 return 425 case ch <- result: 426 } 427 } 428 429 func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) error { 430 handle, ok := d.tasks.Get(taskID) 431 if !ok { 432 return drivers.ErrTaskNotFound 433 } 434 435 if err := handle.exec.Shutdown(signal, timeout); err != nil { 436 if handle.pluginClient.Exited() { 437 return nil 438 } 439 return fmt.Errorf("executor Shutdown failed: %v", err) 440 } 441 442 return nil 443 } 444 445 func (d *Driver) DestroyTask(taskID string, force bool) error { 446 handle, ok := d.tasks.Get(taskID) 447 if !ok { 448 return drivers.ErrTaskNotFound 449 } 450 451 if handle.IsRunning() && !force { 452 return fmt.Errorf("cannot destroy running task") 453 } 454 455 if !handle.pluginClient.Exited() { 456 if handle.IsRunning() { 457 if err := handle.exec.Shutdown("", 0); err != nil { 458 handle.logger.Error("destroying executor failed", "err", err) 459 } 460 } 461 462 handle.pluginClient.Kill() 463 } 464 465 d.tasks.Delete(taskID) 466 return nil 467 } 468 469 func (d *Driver) InspectTask(taskID string) (*drivers.TaskStatus, error) { 470 handle, ok := d.tasks.Get(taskID) 471 if !ok { 472 return nil, drivers.ErrTaskNotFound 473 } 474 475 return handle.TaskStatus(), nil 476 } 477 478 func (d *Driver) TaskStats(ctx context.Context, taskID string, interval time.Duration) (<-chan *drivers.TaskResourceUsage, error) { 479 handle, ok := d.tasks.Get(taskID) 480 if !ok { 481 return nil, drivers.ErrTaskNotFound 482 } 483 484 return handle.exec.Stats(ctx, interval) 485 } 486 487 func (d *Driver) TaskEvents(ctx context.Context) (<-chan *drivers.TaskEvent, error) { 488 return d.eventer.TaskEvents(ctx) 489 } 490 491 func (d *Driver) SignalTask(taskID string, signal string) error { 492 handle, ok := d.tasks.Get(taskID) 493 if !ok { 494 return drivers.ErrTaskNotFound 495 } 496 497 sig := os.Interrupt 498 if s, ok := signals.SignalLookup[signal]; ok { 499 sig = s 500 } else { 501 d.logger.Warn("unknown signal to send to task, using SIGINT instead", "signal", signal, "task_id", handle.taskConfig.ID) 502 503 } 504 return handle.exec.Signal(sig) 505 } 506 507 func (d *Driver) ExecTask(taskID string, cmd []string, timeout time.Duration) (*drivers.ExecTaskResult, error) { 508 if len(cmd) == 0 { 509 return nil, fmt.Errorf("error cmd must have at least one value") 510 } 511 handle, ok := d.tasks.Get(taskID) 512 if !ok { 513 return nil, drivers.ErrTaskNotFound 514 } 515 516 args := []string{} 517 if len(cmd) > 1 { 518 args = cmd[1:] 519 } 520 521 out, exitCode, err := handle.exec.Exec(time.Now().Add(timeout), cmd[0], args) 522 if err != nil { 523 return nil, err 524 } 525 526 return &drivers.ExecTaskResult{ 527 Stdout: out, 528 ExitResult: &drivers.ExitResult{ 529 ExitCode: exitCode, 530 }, 531 }, nil 532 } 533 534 var _ drivers.ExecTaskStreamingRawDriver = (*Driver)(nil) 535 536 func (d *Driver) ExecTaskStreamingRaw(ctx context.Context, 537 taskID string, 538 command []string, 539 tty bool, 540 stream drivers.ExecTaskStream) error { 541 542 if len(command) == 0 { 543 return fmt.Errorf("error cmd must have at least one value") 544 } 545 handle, ok := d.tasks.Get(taskID) 546 if !ok { 547 return drivers.ErrTaskNotFound 548 } 549 550 return handle.exec.ExecStreaming(ctx, command, tty, stream) 551 }