github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/singularity.go (about) 1 // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License. 2 3 package runner 4 5 // This file contains the implementation of an execution module for singularity 6 // within the studioML go runner 7 // 8 9 import ( 10 "bufio" 11 "bytes" 12 "context" 13 "fmt" 14 "io/ioutil" 15 "os" 16 "os/exec" 17 "path/filepath" 18 "sort" 19 "strings" 20 "sync" 21 "text/template" 22 "time" 23 24 "github.com/go-stack/stack" 25 "github.com/jjeffery/kv" // MIT License 26 ) 27 28 // Singularity is a data structure that contains the description of a singularity container resource 29 type Singularity struct { 30 Request *Request 31 BaseDir string 32 BaseImage string 33 } 34 35 // NewSingularity is used to instantiate a singularity resource based upon a request, typically sent 36 // across a go channel or similar 37 func NewSingularity(rqst *Request, dir string) (sing *Singularity, err kv.Error) { 38 39 sing = &Singularity{ 40 Request: rqst, 41 BaseDir: dir, 42 } 43 44 art, isPresent := rqst.Experiment.Artifacts["_singularity"] 45 if !isPresent { 46 return nil, kv.NewError("_singularity artifact is missing").With("stack", stack.Trace().TrimRuntime()) 47 } 48 49 // Look for the singularity artifact and extract the base image name 50 // that will be used from shub://sentient-singularity 51 // 52 if errGo := os.MkdirAll(filepath.Join(dir, "_runner"), 0700); errGo != nil { 53 return nil, kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 54 } 55 56 os.MkdirAll(filepath.Join(dir, "..", "blob-cache"), 0700) 57 os.MkdirAll(filepath.Join(dir, "..", "queue"), 0700) 58 os.MkdirAll(filepath.Join(dir, "..", "artifact-mappings", rqst.Experiment.Key), 0700) 59 60 sing.BaseImage = art.Qualified 61 switch { 62 case strings.HasPrefix(art.Qualified, "shub://sentient-singularity/"): 63 case strings.HasPrefix(art.Qualified, "dockerhub://tensorflow/"): 64 default: 65 return nil, kv.NewError("untrusted image specified").With("stack", stack.Trace().TrimRuntime()).With("artifact", art) 66 } 67 return sing, nil 68 } 69 70 func (s *Singularity) makeDef(alloc *Allocated, e interface{}) (fn string, err kv.Error) { 71 72 // Extract all of the python variables into two collections with the studioML extracted out 73 // Ignore the tensorflow version as the container is responsible for cuda 74 pips, cfgPips, studioPIP, _ := pythonModules(s.Request, alloc) 75 76 // If the studioPIP was specified but we have a dist directory then we need to clear the 77 // studioPIP, otherwise leave it there 78 pth, errGo := filepath.Abs(filepath.Join(s.BaseDir, "workspace", "dist", "studioml-*.tar.gz")) 79 if errGo != nil { 80 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 81 } 82 matches, _ := filepath.Glob(pth) 83 if len(matches) != 0 { 84 // Extract the most recent version of studioML from the dist directory 85 sort.Strings(matches) 86 studioPIP = matches[len(matches)-1] 87 } 88 89 params := struct { 90 E interface{} 91 S *Singularity 92 I string 93 Dir string 94 Pips []string 95 CfgPips []string 96 StudioPIP string 97 ImgType string 98 }{ 99 E: e, 100 S: s, 101 I: s.BaseImage, 102 Dir: filepath.Join(s.BaseDir, "_runner"), 103 Pips: pips, 104 CfgPips: cfgPips, 105 StudioPIP: studioPIP, 106 } 107 108 switch { 109 case strings.HasPrefix(params.I, "shub://singularity-hub/sentient-singularity"): 110 params.ImgType = "debootstrap" 111 case strings.HasPrefix(params.I, "dockerhub://tensorflow/"): 112 params.ImgType = "docker" 113 params.I = strings.Replace(params.I, "dockerhub://", "", 1) 114 } 115 116 // Create a shell script that will do everything needed to run 117 // the python environment in a virtual env 118 tmpl, errGo := template.New("singularityRunner").Parse( 119 `Bootstrap: {{.ImgType}} 120 From: {{.I}} 121 122 %labels 123 ai.sentient.maintainer Karl Mutch 124 ai.sentient.version 0.0 125 126 %post 127 {{range $key, $value := .E.Request.Config.Env}} 128 echo 'export {{$key}}="{{$value}}"' >> $SINGULARITY_ENVIRONMENT 129 {{end}} 130 {{range $key, $value := .E.ExprEnvs}} 131 echo 'export {{$key}}="{{$value}}"' >> $SINGULARITY_ENVIRONMENT 132 {{end}} 133 echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu/' >> $SINGULARITY_ENVIRONMENT 134 echo 'export STUDIOML_EXPERIMENT={{.E.ExprSubDir}}' >> $SINGULARITY_ENVIRONMENT 135 echo 'export STUDIOML_HOME={{.E.RootDir}}' >> $SINGULARITY_ENVIRONMENT 136 pip install virtualenv 137 virtualenv {{.Dir}} 138 chmod +x {{.Dir}}/bin/activate 139 {{.Dir}}/bin/activate 140 pip freeze 141 {{if .StudioPIP}} 142 pip install -I {{.StudioPIP}} 143 {{end}} 144 {{if .Pips}} 145 pip install -I {{range .Pips}} {{.}}{{end}} 146 {{end}} 147 pip install pyopenssl --upgrade 148 {{if .CfgPips}} 149 pip install {{range .CfgPips}} {{.}}{{end}} 150 {{end}} 151 pip freeze 152 153 %runscript 154 {{.Dir}}/bin/activate 155 cd {{.E.ExprDir}}/workspace 156 python {{.E.Request.Experiment.Filename}} {{range .E.Request.Experiment.Args}}{{.}} {{end}} 157 date 158 `) 159 160 if errGo != nil { 161 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 162 } 163 164 content := new(bytes.Buffer) 165 errGo = tmpl.Execute(content, params) 166 if errGo != nil { 167 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 168 } 169 170 fn = filepath.Join(s.BaseDir, "_runner", "Singularity.def") 171 if errGo = ioutil.WriteFile(fn, content.Bytes(), 0600); errGo != nil { 172 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 173 } 174 return fn, nil 175 } 176 177 func (s *Singularity) makeBuildScript(e interface{}) (fn string, err kv.Error) { 178 179 fn = filepath.Join(s.BaseDir, "_runner", "build.sh") 180 181 params := struct { 182 Dir string 183 BaseImage string 184 }{ 185 Dir: filepath.Join(s.BaseDir, "_runner"), 186 BaseImage: s.BaseImage, 187 } 188 189 tmpl, errGo := template.New("singularityRunner").Parse( 190 `#!/bin/bash -x 191 sudo singularity build {{.Dir}}/runner.img {{.Dir}}/Singularity.def 192 `) 193 194 if errGo != nil { 195 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 196 } 197 198 content := new(bytes.Buffer) 199 errGo = tmpl.Execute(content, params) 200 if errGo != nil { 201 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 202 } 203 204 if errGo := ioutil.WriteFile(fn, content.Bytes(), 0700); errGo != nil { 205 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 206 } 207 return fn, nil 208 } 209 210 func (s *Singularity) runBuildScript(script string) (err kv.Error) { 211 212 ctx := context.Background() 213 outputFN := filepath.Join(s.BaseDir, "output", "output") 214 215 // Move to starting the process that we will monitor with the experiment running within 216 // it 217 // 218 219 reporterC := make(chan *string) 220 defer close(reporterC) 221 222 go func() { 223 for { 224 select { 225 case msg := <-reporterC: 226 if msg == nil { 227 return 228 } 229 } 230 } 231 }() 232 233 return runWait(ctx, script, filepath.Join(s.BaseDir, "_runner"), outputFN, reporterC) 234 } 235 236 func (s *Singularity) makeExecScript(e interface{}) (fn string, err kv.Error) { 237 238 fn = filepath.Join(s.BaseDir, "_runner", "exec.sh") 239 240 params := struct { 241 Dir string 242 }{ 243 Dir: filepath.Join(s.BaseDir, "_runner"), 244 } 245 246 tmpl, errGo := template.New("singularityRunner").Parse( 247 `#!/bin/bash -x 248 singularity run --home {{.Dir}} -B /tmp:/tmp -B /usr/local/cuda:/usr/local/cuda -B /usr/lib/nvidia-384:/usr/lib/nvidia-384 --nv {{.Dir}}/runner.img 249 `) 250 251 if errGo != nil { 252 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 253 } 254 255 content := new(bytes.Buffer) 256 errGo = tmpl.Execute(content, params) 257 if errGo != nil { 258 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 259 } 260 261 if errGo := ioutil.WriteFile(fn, content.Bytes(), 0700); errGo != nil { 262 return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 263 } 264 return fn, nil 265 } 266 267 // Make is used to write a script file that is generated for the specific TF tasks studioml has sent 268 // to retrieve any python packages etc then to run the task 269 // 270 func (s *Singularity) Make(alloc *Allocated, e interface{}) (err kv.Error) { 271 272 _, err = s.makeDef(alloc, e) 273 if err != nil { 274 return err 275 } 276 277 script, err := s.makeBuildScript(e) 278 if err != nil { 279 return err 280 } 281 282 if err = s.runBuildScript(script); err != nil { 283 return err 284 } 285 286 if _, err = s.makeExecScript(e); err != nil { 287 return err 288 } 289 290 return nil 291 } 292 293 // Run will use a generated script file and will run it to completion while marshalling 294 // results and files from the computation. Run is a blocking call and will only return 295 // upon completion or termination of the process it starts 296 // 297 func (s *Singularity) Run(ctx context.Context, refresh map[string]Artifact) (err kv.Error) { 298 299 outputFN := filepath.Join(s.BaseDir, "output", "output") 300 script := filepath.Join(s.BaseDir, "_runner", "exec.sh") 301 302 reporterC := make(chan *string) 303 defer close(reporterC) 304 305 go func() { 306 for { 307 select { 308 case msg := <-reporterC: 309 if msg == nil { 310 return 311 } 312 } 313 } 314 }() 315 316 return runWait(ctx, script, filepath.Join(s.BaseDir, "_runner"), outputFN, reporterC) 317 } 318 319 func runWait(ctx context.Context, script string, dir string, outputFN string, errorC chan *string) (err kv.Error) { 320 321 stopCmd, stopCmdCancel := context.WithCancel(context.Background()) 322 // defers are stacked in LIFO order so cancelling this context is the last 323 // thing this function will do 324 defer stopCmdCancel() 325 326 // Move to starting the process that we will monitor with the experiment running within 327 // it 328 329 // #nosec 330 cmd := exec.Command("/bin/bash", "-c", filepath.Clean(script)) 331 cmd.Dir = dir 332 333 stdout, errGo := cmd.StdoutPipe() 334 if errGo != nil { 335 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 336 } 337 stderr, errGo := cmd.StderrPipe() 338 if errGo != nil { 339 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 340 } 341 342 outC := make(chan []byte) 343 defer close(outC) 344 errC := make(chan string) 345 defer close(errC) 346 347 f, errGo := os.Create(outputFN) 348 if errGo != nil { 349 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("outputFN", outputFN) 350 } 351 352 stopCopy := make(chan struct{}, 1) 353 go procOutput(stopCopy, f, outC, errC) 354 355 if errGo = cmd.Start(); err != nil { 356 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 357 } 358 359 waitOnIO := sync.WaitGroup{} 360 waitOnIO.Add(2) 361 362 go func() { 363 defer waitOnIO.Done() 364 time.Sleep(time.Second) 365 s := bufio.NewScanner(stdout) 366 s.Split(bufio.ScanRunes) 367 for s.Scan() { 368 outC <- s.Bytes() 369 } 370 if errGo := s.Err(); errGo != nil { 371 if err != nil { 372 err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 373 } 374 } 375 }() 376 377 go func() { 378 defer waitOnIO.Done() 379 time.Sleep(time.Second) 380 s := bufio.NewScanner(stderr) 381 s.Split(bufio.ScanLines) 382 for s.Scan() { 383 errC <- s.Text() 384 } 385 if errGo := s.Err(); errGo != nil { 386 if err != nil { 387 err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 388 } 389 } 390 }() 391 392 go func() { 393 for { 394 select { 395 case <-ctx.Done(): 396 if errGo := cmd.Process.Kill(); errGo != nil { 397 msg := fmt.Sprintf("could not be killed, maximum life time reached, due to %v", errGo) 398 select { 399 case errorC <- &msg: 400 default: 401 } 402 return 403 } 404 msg := "killed, maximum life time reached" 405 select { 406 case errorC <- &msg: 407 default: 408 } 409 return 410 case <-stopCmd.Done(): 411 return 412 } 413 } 414 }() 415 416 if errGo = cmd.Wait(); err != nil { 417 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 418 } 419 420 waitOnIO.Wait() 421 close(stopCopy) 422 423 if err == nil && ctx.Err() != nil { 424 err = kv.Wrap(ctx.Err()).With("stack", stack.Trace().TrimRuntime()) 425 } 426 427 return err 428 } 429 430 // Close is a stub method for termination of a singularity resource 431 func (*Singularity) Close() (err kv.Error) { 432 return nil 433 }