github.com/wtsi-hgi/go-softpack-builder@v1.8.1/wr/wr.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2023, 2024 Genome Research Ltd.
     3   *
     4   * Permission is hereby granted, free of charge, to any person obtaining
     5   * a copy of this software and associated documentation files (the
     6   * "Software"), to deal in the Software without restriction, including
     7   * without limitation the rights to use, copy, modify, merge, publish,
     8   * distribute, sublicense, and/or sell copies of the Software, and to
     9   * permit persons to whom the Software is furnished to do so, subject to
    10   * the following conditions:
    11   *
    12   * The above copyright notice and this permission notice shall be included
    13   * in all copies or substantial portions of the Software.
    14   *
    15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    18   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    19   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    20   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    21   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    22   ******************************************************************************/
    23  
    24  package wr
    25  
    26  import (
    27  	"bufio"
    28  	"bytes"
    29  	_ "embed"
    30  	"log/slog"
    31  	"os/exec"
    32  	"strings"
    33  	"text/template"
    34  	"time"
    35  )
    36  
// WRJobStatus represents the state of a wr job, as reported by `wr status`.
type WRJobStatus int

// The possible values of a WRJobStatus. Apart from WRJobStatusInvalid, each one
// corresponds to the lower-case status string of the same name in `wr status`
// plain output (eg. "delayed" for WRJobStatusDelayed).
const (
	// WRJobStatusInvalid is the zero value. It is what you get when a job ID
	// isn't found or when the status string isn't recognised. It counts as an
	// "exited" status when waiting.
	WRJobStatusInvalid WRJobStatus = iota
	// WRJobStatusDelayed corresponds to the "delayed" status string.
	WRJobStatusDelayed
	// WRJobStatusReady corresponds to the "ready" status string.
	WRJobStatusReady
	// WRJobStatusReserved corresponds to the "reserved" status string.
	WRJobStatusReserved
	// WRJobStatusRunning corresponds to the "running" status string. It counts
	// as a "started" status when waiting.
	WRJobStatusRunning
	// WRJobStatusLost corresponds to the "lost" status string. It counts as a
	// "started" status when waiting.
	WRJobStatusLost
	// WRJobStatusBuried corresponds to the "buried" status string, which is
	// what Status() reports for a job that failed. It counts as an "exited"
	// status when waiting.
	WRJobStatusBuried
	// WRJobStatusComplete corresponds to the "complete" status string. It
	// counts as an "exited" status when waiting.
	WRJobStatusComplete
)
    49  
const (
	// plainStatusCols is the number of tab-separated columns a line of
	// `wr status -o plain` output must have to be considered when parsing: the
	// job ID followed by its status.
	plainStatusCols     = 2
	// defaultPollDuration is the interval between status checks used by
	// Runners created with New().
	defaultPollDuration = 5 * time.Second
)
    54  
    55  type Error struct {
    56  	msg string
    57  }
    58  
    59  func (e Error) Error() string { return "wr cmd failed: " + e.msg }
    60  
// wrTmplStr holds the contents of wr.tmpl, embedded at build time.
//
//go:embed wr.tmpl
var wrTmplStr string

// wrTmpl is the parsed form of wrTmplStr; it is set once by init().
var wrTmpl *template.Template //nolint:gochecknoglobals

// init parses the embedded template at package initialisation.
// template.Must panics if wr.tmpl is not a valid template.
func init() { //nolint:gochecknoinits
	wrTmpl = template.Must(template.New("").Parse(wrTmplStr))
}
    68  
    69  // SingularityBuildInS3WRInput returns wr input that could be piped to `wr add`
    70  // and that would run a singularity build where the working directory is a fuse
    71  // mount of the given s3Path.
    72  func SingularityBuildInS3WRInput(s3Path, hash string) (string, error) {
    73  	var w strings.Builder
    74  
    75  	if err := wrTmpl.Execute(&w, struct {
    76  		S3Path, Hash string
    77  	}{
    78  		s3Path,
    79  		hash,
    80  	}); err != nil {
    81  		return "", err
    82  	}
    83  
    84  	return w.String(), nil
    85  }
    86  
// Runner lets you Add() a job to wr by running `wr add`, and then check on or
// wait for that job with Status(), WaitForRunning() and Wait().
type Runner struct {
	// deployment is passed as --deployment to every wr command.
	deployment   string
	// memory is passed as --memory to `wr add`.
	memory       string
	// pollDuration is the interval between status checks while waiting.
	pollDuration time.Duration
}
    93  
    94  // New returns a Runner that will use the given wr deployment to wr add jobs
    95  // during Run().
    96  func New(deployment string) *Runner {
    97  	return &Runner{
    98  		deployment:   deployment,
    99  		memory:       "43G",
   100  		pollDuration: defaultPollDuration,
   101  	}
   102  }
   103  
   104  // Run pipes the given wrInput (eg. as produced by
   105  // SingularityBuildInS3WRInput()) to `wr add`, which adds a job to wr's queue
   106  // and returns its ID. You should call Wait(ID) to actually wait for the job to
   107  // finishing running.
   108  //
   109  // The memory defaults to 8GB, time to 8hrs, and if the cmd in the input has
   110  // previously been run, the cmd will be re-run.
   111  //
   112  // NB: if the cmd is a duplicate of a currently queued job, this will not
   113  // generate an error, but just return the id of the existing job.
   114  func (r *Runner) Add(wrInput string) (string, error) {
   115  	cmd := exec.Command("wr", "add", "--deployment", r.deployment, "--simple", //nolint:gosec
   116  		"--time", "8h", "--memory", r.memory, "-o", "2", "--rerun")
   117  	cmd.Stdin = strings.NewReader(wrInput)
   118  
   119  	return r.runWRCmd(cmd)
   120  }
   121  
   122  func (r *Runner) runWRCmd(cmd *exec.Cmd) (string, error) {
   123  	var stdout, stderr bytes.Buffer
   124  
   125  	cmd.Stdout = &stdout
   126  	cmd.Stderr = &stderr
   127  
   128  	err := cmd.Run()
   129  	slog.Debug("ran wr command", "cmd", cmd.String(), "stdout", stdout.String(),
   130  		"stderr", stderr.String(), "err", err, "exitcode", cmd.ProcessState.ExitCode())
   131  
   132  	if err != nil {
   133  		errStr := stderr.String()
   134  		if !strings.Contains(errStr, "EROR") {
   135  			return strings.TrimSpace(stdout.String()), nil
   136  		}
   137  
   138  		if errStr == "" {
   139  			errStr = err.Error()
   140  		}
   141  
   142  		return "", Error{msg: errStr}
   143  	}
   144  
   145  	return strings.TrimSpace(stdout.String()), nil
   146  }
   147  
   148  // WaitForRunning waits until the given wr job either starts running, or exits.
   149  func (r *Runner) WaitForRunning(id string) error {
   150  	var err error
   151  
   152  	cb := func(status WRJobStatus, cbErr error) bool {
   153  		err = cbErr
   154  
   155  		return err != nil || statusIsStarted(status) || statusIsExited(status)
   156  	}
   157  
   158  	r.pollStatus(id, cb)
   159  
   160  	return err
   161  }
   162  
   163  func statusIsStarted(status WRJobStatus) bool {
   164  	return status == WRJobStatusRunning || status == WRJobStatusLost
   165  }
   166  
   167  func statusIsExited(status WRJobStatus) bool {
   168  	return status == WRJobStatusInvalid || status == WRJobStatusBuried || status == WRJobStatusComplete
   169  }
   170  
// pollStatusCallback is called by pollStatus() with the WRJobStatus and error
// from each Status() check, and should return true if you want to stop polling
// now.
type pollStatusCallback = func(WRJobStatus, error) bool
   174  
   175  func (r *Runner) pollStatus(id string, cb pollStatusCallback) {
   176  	ticker := time.NewTicker(r.pollDuration)
   177  	defer ticker.Stop()
   178  
   179  	for range ticker.C {
   180  		if cb(r.Status(id)) {
   181  			return
   182  		}
   183  	}
   184  }
   185  
   186  // Wait waits for the given wr job to exit.
   187  func (r *Runner) Wait(id string) (WRJobStatus, error) {
   188  	var (
   189  		status WRJobStatus
   190  		err    error
   191  	)
   192  
   193  	cb := func(cbStatus WRJobStatus, cbErr error) bool {
   194  		status = cbStatus
   195  		err = cbErr
   196  
   197  		return err != nil || statusIsExited(status)
   198  	}
   199  
   200  	r.pollStatus(id, cb)
   201  
   202  	return status, err
   203  }
   204  
   205  // Status returns the status of the wr job with the given internal ID.
   206  //
   207  // Returns WRJobStatusInvalid if the ID wasn't found. Returns WRJobStatusBuried
   208  // if it failed. Only returns an error if there was a problem getting the
   209  // status.
   210  func (r *Runner) Status(id string) (WRJobStatus, error) {
   211  	cmd := exec.Command("wr", "status", "--deployment", r.deployment, "-o", //nolint:gosec
   212  		"plain", "-i", id, "-y")
   213  
   214  	out, err := r.runWRCmd(cmd)
   215  	if err != nil {
   216  		slog.Error("wr status command failed", "err", err)
   217  
   218  		return WRJobStatusInvalid, err
   219  	}
   220  
   221  	return parseWRStatus(out, id)
   222  }
   223  
   224  func parseWRStatus(wrStatusOutput, id string) (WRJobStatus, error) {
   225  	scanner := bufio.NewScanner(strings.NewReader(wrStatusOutput))
   226  	for scanner.Scan() {
   227  		cols := strings.Split(scanner.Text(), "\t")
   228  		if len(cols) != plainStatusCols {
   229  			continue
   230  		}
   231  
   232  		if cols[0] != id {
   233  			continue
   234  		}
   235  
   236  		return statusStringToType(cols[1]), nil
   237  	}
   238  
   239  	slog.Error("wr status parsing to find a job failed", "id", id, "err", scanner.Err())
   240  
   241  	return WRJobStatusInvalid, scanner.Err()
   242  }
   243  
   244  func statusStringToType(status string) WRJobStatus { //nolint:gocyclo
   245  	switch status {
   246  	case "delayed":
   247  		return WRJobStatusDelayed
   248  	case "ready":
   249  		return WRJobStatusReady
   250  	case "reserved":
   251  		return WRJobStatusReserved
   252  	case "running":
   253  		return WRJobStatusRunning
   254  	case "lost":
   255  		return WRJobStatusLost
   256  	case "buried":
   257  		return WRJobStatusBuried
   258  	case "complete":
   259  		return WRJobStatusComplete
   260  	default:
   261  		return WRJobStatusInvalid
   262  	}
   263  }