// Source: github.com/wtsi-hgi/go-softpack-builder@v1.8.1/wr/wr.go

/*******************************************************************************
 * Copyright (c) 2023, 2024 Genome Research Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 ******************************************************************************/ 23 24 package wr 25 26 import ( 27 "bufio" 28 "bytes" 29 _ "embed" 30 "log/slog" 31 "os/exec" 32 "strings" 33 "text/template" 34 "time" 35 ) 36 37 type WRJobStatus int 38 39 const ( 40 WRJobStatusInvalid WRJobStatus = iota 41 WRJobStatusDelayed 42 WRJobStatusReady 43 WRJobStatusReserved 44 WRJobStatusRunning 45 WRJobStatusLost 46 WRJobStatusBuried 47 WRJobStatusComplete 48 ) 49 50 const ( 51 plainStatusCols = 2 52 defaultPollDuration = 5 * time.Second 53 ) 54 55 type Error struct { 56 msg string 57 } 58 59 func (e Error) Error() string { return "wr cmd failed: " + e.msg } 60 61 //go:embed wr.tmpl 62 var wrTmplStr string 63 var wrTmpl *template.Template //nolint:gochecknoglobals 64 65 func init() { //nolint:gochecknoinits 66 wrTmpl = template.Must(template.New("").Parse(wrTmplStr)) 67 } 68 69 // SingularityBuildInS3WRInput returns wr input that could be piped to `wr add` 70 // and that would run a singularity build where the working directory is a fuse 71 // mount of the given s3Path. 72 func SingularityBuildInS3WRInput(s3Path, hash string) (string, error) { 73 var w strings.Builder 74 75 if err := wrTmpl.Execute(&w, struct { 76 S3Path, Hash string 77 }{ 78 s3Path, 79 hash, 80 }); err != nil { 81 return "", err 82 } 83 84 return w.String(), nil 85 } 86 87 // Runner lets you Run() a wr add command. 88 type Runner struct { 89 deployment string 90 memory string 91 pollDuration time.Duration 92 } 93 94 // New returns a Runner that will use the given wr deployment to wr add jobs 95 // during Run(). 96 func New(deployment string) *Runner { 97 return &Runner{ 98 deployment: deployment, 99 memory: "43G", 100 pollDuration: defaultPollDuration, 101 } 102 } 103 104 // Run pipes the given wrInput (eg. as produced by 105 // SingularityBuildInS3WRInput()) to `wr add`, which adds a job to wr's queue 106 // and returns its ID. 
You should call Wait(ID) to actually wait for the job to 107 // finishing running. 108 // 109 // The memory defaults to 8GB, time to 8hrs, and if the cmd in the input has 110 // previously been run, the cmd will be re-run. 111 // 112 // NB: if the cmd is a duplicate of a currently queued job, this will not 113 // generate an error, but just return the id of the existing job. 114 func (r *Runner) Add(wrInput string) (string, error) { 115 cmd := exec.Command("wr", "add", "--deployment", r.deployment, "--simple", //nolint:gosec 116 "--time", "8h", "--memory", r.memory, "-o", "2", "--rerun") 117 cmd.Stdin = strings.NewReader(wrInput) 118 119 return r.runWRCmd(cmd) 120 } 121 122 func (r *Runner) runWRCmd(cmd *exec.Cmd) (string, error) { 123 var stdout, stderr bytes.Buffer 124 125 cmd.Stdout = &stdout 126 cmd.Stderr = &stderr 127 128 err := cmd.Run() 129 slog.Debug("ran wr command", "cmd", cmd.String(), "stdout", stdout.String(), 130 "stderr", stderr.String(), "err", err, "exitcode", cmd.ProcessState.ExitCode()) 131 132 if err != nil { 133 errStr := stderr.String() 134 if !strings.Contains(errStr, "EROR") { 135 return strings.TrimSpace(stdout.String()), nil 136 } 137 138 if errStr == "" { 139 errStr = err.Error() 140 } 141 142 return "", Error{msg: errStr} 143 } 144 145 return strings.TrimSpace(stdout.String()), nil 146 } 147 148 // WaitForRunning waits until the given wr job either starts running, or exits. 
149 func (r *Runner) WaitForRunning(id string) error { 150 var err error 151 152 cb := func(status WRJobStatus, cbErr error) bool { 153 err = cbErr 154 155 return err != nil || statusIsStarted(status) || statusIsExited(status) 156 } 157 158 r.pollStatus(id, cb) 159 160 return err 161 } 162 163 func statusIsStarted(status WRJobStatus) bool { 164 return status == WRJobStatusRunning || status == WRJobStatusLost 165 } 166 167 func statusIsExited(status WRJobStatus) bool { 168 return status == WRJobStatusInvalid || status == WRJobStatusBuried || status == WRJobStatusComplete 169 } 170 171 // pollStatusCallback receives a WRJobStatus and error, and should return true 172 // if you want to stop polling now. 173 type pollStatusCallback = func(WRJobStatus, error) bool 174 175 func (r *Runner) pollStatus(id string, cb pollStatusCallback) { 176 ticker := time.NewTicker(r.pollDuration) 177 defer ticker.Stop() 178 179 for range ticker.C { 180 if cb(r.Status(id)) { 181 return 182 } 183 } 184 } 185 186 // Wait waits for the given wr job to exit. 187 func (r *Runner) Wait(id string) (WRJobStatus, error) { 188 var ( 189 status WRJobStatus 190 err error 191 ) 192 193 cb := func(cbStatus WRJobStatus, cbErr error) bool { 194 status = cbStatus 195 err = cbErr 196 197 return err != nil || statusIsExited(status) 198 } 199 200 r.pollStatus(id, cb) 201 202 return status, err 203 } 204 205 // Status returns the status of the wr job with the given internal ID. 206 // 207 // Returns WRJobStatusInvalid if the ID wasn't found. Returns WRJobStatusBuried 208 // if it failed. Only returns an error if there was a problem getting the 209 // status. 
210 func (r *Runner) Status(id string) (WRJobStatus, error) { 211 cmd := exec.Command("wr", "status", "--deployment", r.deployment, "-o", //nolint:gosec 212 "plain", "-i", id, "-y") 213 214 out, err := r.runWRCmd(cmd) 215 if err != nil { 216 slog.Error("wr status command failed", "err", err) 217 218 return WRJobStatusInvalid, err 219 } 220 221 return parseWRStatus(out, id) 222 } 223 224 func parseWRStatus(wrStatusOutput, id string) (WRJobStatus, error) { 225 scanner := bufio.NewScanner(strings.NewReader(wrStatusOutput)) 226 for scanner.Scan() { 227 cols := strings.Split(scanner.Text(), "\t") 228 if len(cols) != plainStatusCols { 229 continue 230 } 231 232 if cols[0] != id { 233 continue 234 } 235 236 return statusStringToType(cols[1]), nil 237 } 238 239 slog.Error("wr status parsing to find a job failed", "id", id, "err", scanner.Err()) 240 241 return WRJobStatusInvalid, scanner.Err() 242 } 243 244 func statusStringToType(status string) WRJobStatus { //nolint:gocyclo 245 switch status { 246 case "delayed": 247 return WRJobStatusDelayed 248 case "ready": 249 return WRJobStatusReady 250 case "reserved": 251 return WRJobStatusReserved 252 case "running": 253 return WRJobStatusRunning 254 case "lost": 255 return WRJobStatusLost 256 case "buried": 257 return WRJobStatusBuried 258 case "complete": 259 return WRJobStatusComplete 260 default: 261 return WRJobStatusInvalid 262 } 263 }