github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/pythonenv.go (about) 1 // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License. 2 3 package runner 4 5 // This file contains the implementation of the python based virtualenv 6 // runtime for studioML workloads 7 8 import ( 9 "bufio" 10 "bytes" 11 "context" 12 "fmt" 13 "io/ioutil" 14 "os" 15 "os/exec" 16 "path" 17 "path/filepath" 18 "sort" 19 "strconv" 20 "strings" 21 "sync" 22 "text/template" 23 "time" 24 25 "github.com/go-stack/stack" 26 "github.com/jjeffery/kv" // MIT License 27 ) 28 29 var ( 30 hostname string 31 ) 32 33 func init() { 34 hostname, _ = os.Hostname() 35 } 36 37 // VirtualEnv encapsulated the context that a python virtual environment is to be 38 // instantiated from including items such as the list of pip installables that should 39 // be loaded and shell script to run. 40 // 41 type VirtualEnv struct { 42 Request *Request 43 Script string 44 } 45 46 // NewVirtualEnv builds the VirtualEnv data structure from data received across the wire 47 // from a studioml client. 48 // 49 func NewVirtualEnv(rqst *Request, dir string) (env *VirtualEnv, err kv.Error) { 50 51 if errGo := os.MkdirAll(filepath.Join(dir, "_runner"), 0700); errGo != nil { 52 return nil, kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 53 } 54 55 return &VirtualEnv{ 56 Request: rqst, 57 Script: filepath.Join(dir, "_runner", "runner.sh"), 58 }, nil 59 } 60 61 // pythonModules is used to scan the pip installables and to groom them based upon a 62 // local distribution of studioML also being included inside the workspace 63 // 64 func pythonModules(rqst *Request, alloc *Allocated) (general []string, configured []string, studioML string, tfVer string) { 65 66 hasGPU := len(alloc.GPU) != 0 67 68 general = []string{} 69 70 gpuSeen := false 71 for _, pkg := range rqst.Experiment.Pythonenv { 72 if strings.HasPrefix(pkg, "studioml==") { 73 studioML = pkg 74 continue 75 } 76 // https://bugs.launchpad.net/ubuntu/+source/python-pip/+bug/1635463 77 // 78 // Groom out bogus package from ubuntu 79 if strings.HasPrefix(pkg, "pkg-resources") { 80 continue 81 } 82 if strings.HasPrefix(pkg, "tensorflow_gpu") { 83 gpuSeen = true 84 } 85 86 if hasGPU && !gpuSeen { 87 if strings.HasPrefix(pkg, "tensorflow==") || pkg == "tensorflow" { 88 spec := strings.Split(pkg, "==") 89 90 if len(spec) < 2 { 91 pkg = "tensorflow_gpu" 92 } else { 93 pkg = "tensorflow_gpu==" + spec[1] 94 tfVer = spec[1] 95 } 96 fmt.Printf("modified tensorflow in general %+v \n", pkg) 97 } 98 } 99 general = append(general, pkg) 100 } 101 102 configured = []string{} 103 for _, pkg := range rqst.Config.Pip { 104 if strings.HasPrefix(pkg, "studioml==") { 105 studioML = pkg 106 continue 107 } 108 if strings.HasPrefix(pkg, "pkg-resources") { 109 continue 110 } 111 if strings.HasPrefix(pkg, "tensorflow_gpu") { 112 gpuSeen = true 113 } 114 if hasGPU && !gpuSeen { 115 if strings.HasPrefix(pkg, "tensorflow==") || pkg == "tensorflow" { 116 spec := strings.Split(pkg, "==") 117 118 if len(spec) < 2 { 119 pkg = "tensorflow_gpu" 120 } else { 121 pkg = "tensorflow_gpu==" + spec[1] 122 tfVer = spec[1] 123 } 124 fmt.Printf("modified tensorflow in configured %+v \n", pkg) 125 } 126 } 127 configured = append(configured, pkg) 128 } 129 130 return general, configured, studioML, tfVer 131 } 132 133 // Make is used to write a script file that is generated for the specific TF tasks studioml has sent 134 // to retrieve any python packages etc then to run the task 135 // 136 func (p *VirtualEnv) Make(alloc *Allocated, e interface{}) (err kv.Error) { 137 138 pips, cfgPips, studioPIP, tfVer := pythonModules(p.Request, alloc) 139 140 // The tensorflow versions 1.5.x and above all support cuda 9 and 1.4.x is cuda 8, 141 // c.f. https://www.tensorflow.org/install/install_sources#tested_source_configurations. 142 // Insert the appropriate version explicitly into the LD_LIBRARY_PATH before other paths 143 cudaDir := "/usr/local/cuda-10.0/lib64" 144 if strings.HasPrefix(tfVer, "1.4") { 145 cudaDir = "/usr/local/cuda-8.0/lib64" 146 } 147 148 // If the studioPIP was specified but we have a dist directory then we need to clear the 149 // studioPIP, otherwise leave it there 150 pth, errGo := filepath.Abs(filepath.Join(path.Dir(p.Script), "..", "workspace", "dist", "studioml-*.tar.gz")) 151 if errGo != nil { 152 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("path", pth) 153 } 154 matches, errGo := filepath.Glob(pth) 155 if errGo != nil { 156 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("path", pth) 157 } 158 if len(matches) != 0 { 159 // Extract the most recent version of studioML from the dist directory 160 sort.Strings(matches) 161 studioPIP = matches[len(matches)-1] 162 } 163 164 params := struct { 165 AllocEnv []string 166 E interface{} 167 Pips []string 168 CfgPips []string 169 StudioPIP string 170 CudaDir string 171 Hostname string 172 }{ 173 AllocEnv: []string{}, 174 E: e, 175 Pips: pips, 176 CfgPips: cfgPips, 177 StudioPIP: studioPIP, 178 CudaDir: cudaDir, 179 Hostname: hostname, 180 } 181 182 if alloc.CPU != nil { 183 if alloc.CPU.cores > 1 { 184 params.AllocEnv = append(params.AllocEnv, "OPENMP=True") 185 params.AllocEnv = append(params.AllocEnv, "MKL_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1)) 186 params.AllocEnv = append(params.AllocEnv, "GOTO_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1)) 187 params.AllocEnv = append(params.AllocEnv, "OMP_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1)) 188 } 189 } 190 191 if len(alloc.GPU) != 0 { 192 for _, resource := range alloc.GPU { 193 for k, v := range resource.Env { 194 params.AllocEnv = append(params.AllocEnv, k+"="+v) 195 } 196 } 197 } else { 198 // Force CUDA GPUs offline manually rather than leaving this undefined 199 params.AllocEnv = append(params.AllocEnv, "CUDA_VISIBLE_DEVICES=\"-1\"") 200 params.AllocEnv = append(params.AllocEnv, "NVIDIA_VISIBLE_DEVICES=\"-1\"") 201 } 202 203 // Create a shell script that will do everything needed to run 204 // the python environment in a virtual env 205 tmpl, errGo := template.New("pythonRunner").Parse( 206 `#!/bin/bash -x 207 sleep 2 208 # Credit https://github.com/fernandoacorreia/azure-docker-registry/blob/master/tools/scripts/create-registry-server 209 function fail { 210 echo $1 >&2 211 exit 1 212 } 213 214 trap 'fail "The execution was aborted because a command exited with an error status code."' ERR 215 216 function retry { 217 local n=1 218 local max=3 219 local delay=10 220 while true; do 221 "$@" && break || { 222 if [[ $n -lt $max ]]; then 223 ((n++)) 224 echo "Command failed. Attempt $n/$max:" 225 sleep $delay; 226 else 227 fail "The command has failed after $n attempts." 228 fi 229 } 230 done 231 } 232 233 set -v 234 date 235 date -u 236 export LC_ALL=en_US.utf8 237 locale 238 hostname 239 set -e 240 export LD_LIBRARY_PATH={{.CudaDir}}:$LD_LIBRARY_PATH:/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu/ 241 mkdir -p {{.E.RootDir}}/blob-cache 242 mkdir -p {{.E.RootDir}}/queue 243 mkdir -p {{.E.RootDir}}/artifact-mappings 244 mkdir -p {{.E.RootDir}}/artifact-mappings/{{.E.Request.Experiment.Key}} 245 export PATH=/root/.pyenv/bin:$PATH 246 export PYENV_VERSION={{.E.Request.Experiment.PythonVer}} 247 IFS=$'\n'; arr=( $(pyenv versions --bare | grep -v studioml || true) ) 248 for i in ${arr[@]} ; do 249 if [[ "$i" == ${PYENV_VERSION}* ]]; then 250 export PYENV_VERSION=$i 251 echo $PYENV_VERSION 252 fi 253 done 254 eval "$(pyenv init -)" 255 eval "$(pyenv virtualenv-init -)" 256 pyenv doctor 257 pyenv virtualenv-delete -f studioml-{{.E.ExprSubDir}} || true 258 pyenv virtualenv $PYENV_VERSION studioml-{{.E.ExprSubDir}} 259 pyenv activate studioml-{{.E.ExprSubDir}} 260 set +e 261 retry python -m pip install "pip==20.0.2" 262 pip freeze --all 263 {{if .StudioPIP}} 264 retry python -m pip install -I {{.StudioPIP}} 265 {{end}} 266 {{if .Pips}} 267 echo "installing project pip {{ .Pips }}" 268 retry python -m pip install {{range .Pips }} {{.}}{{end}} 269 {{end}} 270 echo "finished installing project pips" 271 retry python -m pip install pyopenssl pipdeptree --upgrade 272 {{if .CfgPips}} 273 echo "installing cfg pips" 274 retry python -m pip install {{range .CfgPips}} {{.}}{{end}} 275 echo "finished installing cfg pips" 276 {{end}} 277 set -e 278 export STUDIOML_EXPERIMENT={{.E.ExprSubDir}} 279 export STUDIOML_HOME={{.E.RootDir}} 280 {{if .AllocEnv}} 281 {{range .AllocEnv}} 282 export {{.}} 283 {{end}} 284 {{end}} 285 export 286 cd {{.E.ExprDir}}/workspace 287 pip freeze 288 pip -V 289 set -x 290 set -e 291 echo "{\"studioml\": { \"experiment\" : {\"key\": \"{{.E.Request.Experiment.Key}}\", \"project\": \"{{.E.Request.Experiment.Project}}\"}}}" | jq -c '.' 292 {{range $key, $value := .E.Request.Experiment.Artifacts}} 293 echo "{\"studioml\": { \"artifacts\" : {\"{{$key}}\": \"{{$value.Qualified}}\"}}}" | jq -c '.' 294 {{end}} 295 echo "{\"studioml\": {\"start_time\": \"` + "`" + `date '+%FT%T.%N%:z'` + "`" + `\"}}" | jq -c '.' 296 echo "{\"studioml\": {\"host\": \"{{.Hostname}}\"}}" | jq -c '.' 297 nvidia-smi 2>/dev/null || true 298 python {{.E.Request.Experiment.Filename}} {{range .E.Request.Experiment.Args}}{{.}} {{end}} 299 result=$? 300 echo $result 301 set +e 302 echo "{\"studioml\": {\"stop_time\": \"` + "`" + `date '+%FT%T.%N%:z'` + "`" + `\"}}" | jq -c '.' 303 cd - 304 locale 305 pyenv deactivate || true 306 pyenv virtualenv-delete -f studioml-{{.E.ExprSubDir}} || true 307 date 308 date -u 309 nvidia-smi 2>/dev/null || true 310 exit $result 311 `) 312 313 if errGo != nil { 314 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 315 } 316 317 content := new(bytes.Buffer) 318 if errGo = tmpl.Execute(content, params); errGo != nil { 319 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 320 } 321 322 if errGo = ioutil.WriteFile(p.Script, content.Bytes(), 0700); errGo != nil { 323 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("script", p.Script) 324 } 325 return nil 326 } 327 328 func procOutput(stopWriter chan struct{}, f *os.File, outC chan []byte, errC chan string) { 329 330 outLine := []byte{} 331 332 defer func() { 333 if len(outLine) != 0 { 334 f.WriteString(string(outLine)) 335 } 336 f.Close() 337 }() 338 339 refresh := time.NewTicker(2 * time.Second) 340 defer refresh.Stop() 341 342 for { 343 select { 344 case <-refresh.C: 345 if len(outLine) != 0 { 346 f.WriteString(string(outLine)) 347 outLine = []byte{} 348 } 349 case <-stopWriter: 350 return 351 case r := <-outC: 352 if len(r) != 0 { 353 outLine = append(outLine, r...) 354 if !bytes.Contains([]byte{'\n'}, r) { 355 continue 356 } 357 } 358 if len(outLine) != 0 { 359 f.WriteString(string(outLine)) 360 outLine = []byte{} 361 } 362 case errLine := <-errC: 363 if len(errLine) != 0 { 364 f.WriteString(errLine + "\n") 365 } 366 } 367 } 368 } 369 370 // Run will use a generated script file and will run it to completion while marshalling 371 // results and files from the computation. Run is a blocking call and will only return 372 // upon completion or termination of the process it starts 373 // 374 func (p *VirtualEnv) Run(ctx context.Context, refresh map[string]Artifact) (err kv.Error) { 375 376 stopCmd, stopCmdCancel := context.WithCancel(context.Background()) 377 // defers are stacked in LIFO order so cancelling this context is the last 378 // thing this function will do, also cancelling the stopCmd will also travel down 379 // the context hierarchy cancelling everything else 380 defer stopCmdCancel() 381 382 // Cancel our own internal context when the outer context is cancelled 383 go func() { 384 select { 385 case <-stopCmd.Done(): 386 case <-ctx.Done(): 387 } 388 stopCmdCancel() 389 }() 390 391 // Create a new TMPDIR because the python pip tends to leave dirt behind 392 // when doing pip builds etc 393 tmpDir, errGo := ioutil.TempDir("", p.Request.Experiment.Key) 394 if errGo != nil { 395 return kv.Wrap(errGo).With("experimentKey", p.Request.Experiment.Key).With("stack", stack.Trace().TrimRuntime()) 396 } 397 defer os.RemoveAll(tmpDir) 398 399 // Move to starting the process that we will monitor with the experiment running within 400 // it 401 402 // #nosec 403 cmd := exec.CommandContext(stopCmd, "/bin/bash", "-c", "export TMPDIR="+tmpDir+"; "+filepath.Clean(p.Script)) 404 cmd.Dir = path.Dir(p.Script) 405 406 // Pipes are used to allow the output to be tracked interactively from the cmd 407 stdout, errGo := cmd.StdoutPipe() 408 if errGo != nil { 409 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 410 } 411 stderr, errGo := cmd.StderrPipe() 412 if errGo != nil { 413 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 414 } 415 416 outC := make(chan []byte) 417 defer close(outC) 418 errC := make(chan string) 419 defer close(errC) 420 421 // Prepare an output file into which the command line stdout and stderr will be written 422 outputFN := filepath.Join(cmd.Dir, "..", "output", "output") 423 f, errGo := os.Create(outputFN) 424 if errGo != nil { 425 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 426 } 427 428 // A quit channel is used to allow fine grained control over when the IO 429 // copy and output task should be created 430 stopOutput := make(chan struct{}, 1) 431 432 // Being the go routine that takes cmd output and appends it to a file on disk 433 go procOutput(stopOutput, f, outC, errC) 434 435 // Start begins the processing asynchronously, the procOutput above will collect the 436 // run results are they are output asynchronously 437 if errGo = cmd.Start(); errGo != nil { 438 return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 439 } 440 441 // Protect the err value when running multiple goroutines 442 errCheck := sync.Mutex{} 443 444 // This code connects the pipes being used by the golang exec command process to the channels that 445 // will be used to bring the output into a single file 446 waitOnIO := sync.WaitGroup{} 447 waitOnIO.Add(2) 448 449 go func() { 450 defer waitOnIO.Done() 451 452 time.Sleep(time.Second) 453 s := bufio.NewScanner(stdout) 454 s.Split(bufio.ScanRunes) 455 for s.Scan() { 456 outC <- s.Bytes() 457 } 458 if errGo := s.Err(); errGo != nil { 459 errCheck.Lock() 460 defer errCheck.Unlock() 461 if err != nil && err != os.ErrClosed { 462 err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 463 } 464 } 465 }() 466 467 go func() { 468 defer waitOnIO.Done() 469 470 time.Sleep(time.Second) 471 s := bufio.NewScanner(stderr) 472 s.Split(bufio.ScanLines) 473 for s.Scan() { 474 errC <- s.Text() 475 } 476 if errGo := s.Err(); errGo != nil { 477 errCheck.Lock() 478 defer errCheck.Unlock() 479 if err != nil && err != os.ErrClosed { 480 err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 481 } 482 } 483 }() 484 485 // Wait for the IO to stop before continuing to tell the background 486 // writer to terminate. This means the IO for the process will 487 // be able to send on the channels until they have stopped. 488 waitOnIO.Wait() 489 490 // Now manually stop the process output copy goroutine once the exec package 491 // has finished 492 close(stopOutput) 493 494 // Wait for the process to exit, and store any error code if possible 495 // before we continue to wait on the processes output devices finishing 496 if errGo = cmd.Wait(); errGo != nil { 497 errCheck.Lock() 498 if err == nil { 499 err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()) 500 } 501 errCheck.Unlock() 502 } 503 504 errCheck.Lock() 505 if err == nil && stopCmd.Err() != nil { 506 err = kv.Wrap(stopCmd.Err()).With("stack", stack.Trace().TrimRuntime()) 507 } 508 errCheck.Unlock() 509 510 return err 511 } 512 513 // Close is used to close any resources which the encapsulated VirtualEnv may have consumed. 514 // 515 func (*VirtualEnv) Close() (err kv.Error) { 516 return nil 517 }