github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/pythonenv.go (about)

     1  // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
     2  
     3  package runner
     4  
     5  // This file contains the implementation of the python based virtualenv
     6  // runtime for studioML workloads
     7  
import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"text/template"
	"time"

	"github.com/go-stack/stack"
	"github.com/jjeffery/kv" // MIT License
)
    28  
    29  var (
    30  	hostname string
    31  )
    32  
    33  func init() {
    34  	hostname, _ = os.Hostname()
    35  }
    36  
// VirtualEnv encapsulates the context that a python virtual environment is to be
// instantiated from, including items such as the list of pip installables that should
// be loaded and the shell script to run.
//
// Request is the experiment request the environment is built for, and Script is the
// file system path of the generated runner shell script.
//
type VirtualEnv struct {
	Request *Request
	Script  string
}
    45  
    46  // NewVirtualEnv builds the VirtualEnv data structure from data received across the wire
    47  // from a studioml client.
    48  //
    49  func NewVirtualEnv(rqst *Request, dir string) (env *VirtualEnv, err kv.Error) {
    50  
    51  	if errGo := os.MkdirAll(filepath.Join(dir, "_runner"), 0700); errGo != nil {
    52  		return nil, kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
    53  	}
    54  
    55  	return &VirtualEnv{
    56  		Request: rqst,
    57  		Script:  filepath.Join(dir, "_runner", "runner.sh"),
    58  	}, nil
    59  }
    60  
    61  // pythonModules is used to scan the pip installables and to groom them based upon a
    62  // local distribution of studioML also being included inside the workspace
    63  //
    64  func pythonModules(rqst *Request, alloc *Allocated) (general []string, configured []string, studioML string, tfVer string) {
    65  
    66  	hasGPU := len(alloc.GPU) != 0
    67  
    68  	general = []string{}
    69  
    70  	gpuSeen := false
    71  	for _, pkg := range rqst.Experiment.Pythonenv {
    72  		if strings.HasPrefix(pkg, "studioml==") {
    73  			studioML = pkg
    74  			continue
    75  		}
    76  		// https://bugs.launchpad.net/ubuntu/+source/python-pip/+bug/1635463
    77  		//
    78  		// Groom out bogus package from ubuntu
    79  		if strings.HasPrefix(pkg, "pkg-resources") {
    80  			continue
    81  		}
    82  		if strings.HasPrefix(pkg, "tensorflow_gpu") {
    83  			gpuSeen = true
    84  		}
    85  
    86  		if hasGPU && !gpuSeen {
    87  			if strings.HasPrefix(pkg, "tensorflow==") || pkg == "tensorflow" {
    88  				spec := strings.Split(pkg, "==")
    89  
    90  				if len(spec) < 2 {
    91  					pkg = "tensorflow_gpu"
    92  				} else {
    93  					pkg = "tensorflow_gpu==" + spec[1]
    94  					tfVer = spec[1]
    95  				}
    96  				fmt.Printf("modified tensorflow in general %+v \n", pkg)
    97  			}
    98  		}
    99  		general = append(general, pkg)
   100  	}
   101  
   102  	configured = []string{}
   103  	for _, pkg := range rqst.Config.Pip {
   104  		if strings.HasPrefix(pkg, "studioml==") {
   105  			studioML = pkg
   106  			continue
   107  		}
   108  		if strings.HasPrefix(pkg, "pkg-resources") {
   109  			continue
   110  		}
   111  		if strings.HasPrefix(pkg, "tensorflow_gpu") {
   112  			gpuSeen = true
   113  		}
   114  		if hasGPU && !gpuSeen {
   115  			if strings.HasPrefix(pkg, "tensorflow==") || pkg == "tensorflow" {
   116  				spec := strings.Split(pkg, "==")
   117  
   118  				if len(spec) < 2 {
   119  					pkg = "tensorflow_gpu"
   120  				} else {
   121  					pkg = "tensorflow_gpu==" + spec[1]
   122  					tfVer = spec[1]
   123  				}
   124  				fmt.Printf("modified tensorflow in configured %+v \n", pkg)
   125  			}
   126  		}
   127  		configured = append(configured, pkg)
   128  	}
   129  
   130  	return general, configured, studioML, tfVer
   131  }
   132  
   133  // Make is used to write a script file that is generated for the specific TF tasks studioml has sent
   134  // to retrieve any python packages etc then to run the task
   135  //
   136  func (p *VirtualEnv) Make(alloc *Allocated, e interface{}) (err kv.Error) {
   137  
   138  	pips, cfgPips, studioPIP, tfVer := pythonModules(p.Request, alloc)
   139  
   140  	// The tensorflow versions 1.5.x and above all support cuda 9 and 1.4.x is cuda 8,
   141  	// c.f. https://www.tensorflow.org/install/install_sources#tested_source_configurations.
   142  	// Insert the appropriate version explicitly into the LD_LIBRARY_PATH before other paths
   143  	cudaDir := "/usr/local/cuda-10.0/lib64"
   144  	if strings.HasPrefix(tfVer, "1.4") {
   145  		cudaDir = "/usr/local/cuda-8.0/lib64"
   146  	}
   147  
   148  	// If the studioPIP was specified but we have a dist directory then we need to clear the
   149  	// studioPIP, otherwise leave it there
   150  	pth, errGo := filepath.Abs(filepath.Join(path.Dir(p.Script), "..", "workspace", "dist", "studioml-*.tar.gz"))
   151  	if errGo != nil {
   152  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("path", pth)
   153  	}
   154  	matches, errGo := filepath.Glob(pth)
   155  	if errGo != nil {
   156  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("path", pth)
   157  	}
   158  	if len(matches) != 0 {
   159  		// Extract the most recent version of studioML from the dist directory
   160  		sort.Strings(matches)
   161  		studioPIP = matches[len(matches)-1]
   162  	}
   163  
   164  	params := struct {
   165  		AllocEnv  []string
   166  		E         interface{}
   167  		Pips      []string
   168  		CfgPips   []string
   169  		StudioPIP string
   170  		CudaDir   string
   171  		Hostname  string
   172  	}{
   173  		AllocEnv:  []string{},
   174  		E:         e,
   175  		Pips:      pips,
   176  		CfgPips:   cfgPips,
   177  		StudioPIP: studioPIP,
   178  		CudaDir:   cudaDir,
   179  		Hostname:  hostname,
   180  	}
   181  
   182  	if alloc.CPU != nil {
   183  		if alloc.CPU.cores > 1 {
   184  			params.AllocEnv = append(params.AllocEnv, "OPENMP=True")
   185  			params.AllocEnv = append(params.AllocEnv, "MKL_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1))
   186  			params.AllocEnv = append(params.AllocEnv, "GOTO_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1))
   187  			params.AllocEnv = append(params.AllocEnv, "OMP_NUM_THREADS="+strconv.Itoa(int(alloc.CPU.cores)-1))
   188  		}
   189  	}
   190  
   191  	if len(alloc.GPU) != 0 {
   192  		for _, resource := range alloc.GPU {
   193  			for k, v := range resource.Env {
   194  				params.AllocEnv = append(params.AllocEnv, k+"="+v)
   195  			}
   196  		}
   197  	} else {
   198  		// Force CUDA GPUs offline manually rather than leaving this undefined
   199  		params.AllocEnv = append(params.AllocEnv, "CUDA_VISIBLE_DEVICES=\"-1\"")
   200  		params.AllocEnv = append(params.AllocEnv, "NVIDIA_VISIBLE_DEVICES=\"-1\"")
   201  	}
   202  
   203  	// Create a shell script that will do everything needed to run
   204  	// the python environment in a virtual env
   205  	tmpl, errGo := template.New("pythonRunner").Parse(
   206  		`#!/bin/bash -x
   207  sleep 2
   208  # Credit https://github.com/fernandoacorreia/azure-docker-registry/blob/master/tools/scripts/create-registry-server
   209  function fail {
   210    echo $1 >&2
   211    exit 1
   212  }
   213  
   214  trap 'fail "The execution was aborted because a command exited with an error status code."' ERR
   215  
   216  function retry {
   217    local n=1
   218    local max=3
   219    local delay=10
   220    while true; do
   221      "$@" && break || {
   222        if [[ $n -lt $max ]]; then
   223          ((n++))
   224          echo "Command failed. Attempt $n/$max:"
   225          sleep $delay;
   226        else
   227          fail "The command has failed after $n attempts."
   228        fi
   229      }
   230    done
   231  }
   232  
   233  set -v
   234  date
   235  date -u
   236  export LC_ALL=en_US.utf8
   237  locale
   238  hostname
   239  set -e
   240  export LD_LIBRARY_PATH={{.CudaDir}}:$LD_LIBRARY_PATH:/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu/
   241  mkdir -p {{.E.RootDir}}/blob-cache
   242  mkdir -p {{.E.RootDir}}/queue
   243  mkdir -p {{.E.RootDir}}/artifact-mappings
   244  mkdir -p {{.E.RootDir}}/artifact-mappings/{{.E.Request.Experiment.Key}}
   245  export PATH=/root/.pyenv/bin:$PATH
   246  export PYENV_VERSION={{.E.Request.Experiment.PythonVer}}
   247  IFS=$'\n'; arr=( $(pyenv versions --bare | grep -v studioml || true) )
   248  for i in ${arr[@]} ; do
   249      if [[ "$i" == ${PYENV_VERSION}* ]]; then
   250  		export PYENV_VERSION=$i
   251  		echo $PYENV_VERSION
   252  	fi
   253  done
   254  eval "$(pyenv init -)"
   255  eval "$(pyenv virtualenv-init -)"
   256  pyenv doctor
   257  pyenv virtualenv-delete -f studioml-{{.E.ExprSubDir}} || true
   258  pyenv virtualenv $PYENV_VERSION studioml-{{.E.ExprSubDir}}
   259  pyenv activate studioml-{{.E.ExprSubDir}}
   260  set +e
   261  retry python -m pip install "pip==20.0.2"
   262  pip freeze --all
   263  {{if .StudioPIP}}
   264  retry python -m pip install -I {{.StudioPIP}}
   265  {{end}}
   266  {{if .Pips}}
   267  echo "installing project pip {{ .Pips }}"
   268  retry python -m pip install {{range .Pips }} {{.}}{{end}}
   269  {{end}}
   270  echo "finished installing project pips"
   271  retry python -m pip install pyopenssl pipdeptree --upgrade
   272  {{if .CfgPips}}
   273  echo "installing cfg pips"
   274  retry python -m pip install {{range .CfgPips}} {{.}}{{end}}
   275  echo "finished installing cfg pips"
   276  {{end}}
   277  set -e
   278  export STUDIOML_EXPERIMENT={{.E.ExprSubDir}}
   279  export STUDIOML_HOME={{.E.RootDir}}
   280  {{if .AllocEnv}}
   281  {{range .AllocEnv}}
   282  export {{.}}
   283  {{end}}
   284  {{end}}
   285  export
   286  cd {{.E.ExprDir}}/workspace
   287  pip freeze
   288  pip -V
   289  set -x
   290  set -e
   291  echo "{\"studioml\": { \"experiment\" : {\"key\": \"{{.E.Request.Experiment.Key}}\", \"project\": \"{{.E.Request.Experiment.Project}}\"}}}" | jq -c '.'
   292  {{range $key, $value := .E.Request.Experiment.Artifacts}}
   293  echo "{\"studioml\": { \"artifacts\" : {\"{{$key}}\": \"{{$value.Qualified}}\"}}}" | jq -c '.'
   294  {{end}}
   295  echo "{\"studioml\": {\"start_time\": \"` + "`" + `date '+%FT%T.%N%:z'` + "`" + `\"}}" | jq -c '.'
   296  echo "{\"studioml\": {\"host\": \"{{.Hostname}}\"}}" | jq -c '.'
   297  nvidia-smi 2>/dev/null || true
   298  python {{.E.Request.Experiment.Filename}} {{range .E.Request.Experiment.Args}}{{.}} {{end}}
   299  result=$?
   300  echo $result
   301  set +e
   302  echo "{\"studioml\": {\"stop_time\": \"` + "`" + `date '+%FT%T.%N%:z'` + "`" + `\"}}" | jq -c '.'
   303  cd -
   304  locale
   305  pyenv deactivate || true
   306  pyenv virtualenv-delete -f studioml-{{.E.ExprSubDir}} || true
   307  date
   308  date -u
   309  nvidia-smi 2>/dev/null || true
   310  exit $result
   311  `)
   312  
   313  	if errGo != nil {
   314  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   315  	}
   316  
   317  	content := new(bytes.Buffer)
   318  	if errGo = tmpl.Execute(content, params); errGo != nil {
   319  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   320  	}
   321  
   322  	if errGo = ioutil.WriteFile(p.Script, content.Bytes(), 0700); errGo != nil {
   323  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("script", p.Script)
   324  	}
   325  	return nil
   326  }
   327  
   328  func procOutput(stopWriter chan struct{}, f *os.File, outC chan []byte, errC chan string) {
   329  
   330  	outLine := []byte{}
   331  
   332  	defer func() {
   333  		if len(outLine) != 0 {
   334  			f.WriteString(string(outLine))
   335  		}
   336  		f.Close()
   337  	}()
   338  
   339  	refresh := time.NewTicker(2 * time.Second)
   340  	defer refresh.Stop()
   341  
   342  	for {
   343  		select {
   344  		case <-refresh.C:
   345  			if len(outLine) != 0 {
   346  				f.WriteString(string(outLine))
   347  				outLine = []byte{}
   348  			}
   349  		case <-stopWriter:
   350  			return
   351  		case r := <-outC:
   352  			if len(r) != 0 {
   353  				outLine = append(outLine, r...)
   354  				if !bytes.Contains([]byte{'\n'}, r) {
   355  					continue
   356  				}
   357  			}
   358  			if len(outLine) != 0 {
   359  				f.WriteString(string(outLine))
   360  				outLine = []byte{}
   361  			}
   362  		case errLine := <-errC:
   363  			if len(errLine) != 0 {
   364  				f.WriteString(errLine + "\n")
   365  			}
   366  		}
   367  	}
   368  }
   369  
   370  // Run will use a generated script file and will run it to completion while marshalling
   371  // results and files from the computation.  Run is a blocking call and will only return
   372  // upon completion or termination of the process it starts
   373  //
   374  func (p *VirtualEnv) Run(ctx context.Context, refresh map[string]Artifact) (err kv.Error) {
   375  
   376  	stopCmd, stopCmdCancel := context.WithCancel(context.Background())
   377  	// defers are stacked in LIFO order so cancelling this context is the last
   378  	// thing this function will do, also cancelling the stopCmd will also travel down
   379  	// the context hierarchy cancelling everything else
   380  	defer stopCmdCancel()
   381  
   382  	// Cancel our own internal context when the outer context is cancelled
   383  	go func() {
   384  		select {
   385  		case <-stopCmd.Done():
   386  		case <-ctx.Done():
   387  		}
   388  		stopCmdCancel()
   389  	}()
   390  
   391  	// Create a new TMPDIR because the python pip tends to leave dirt behind
   392  	// when doing pip builds etc
   393  	tmpDir, errGo := ioutil.TempDir("", p.Request.Experiment.Key)
   394  	if errGo != nil {
   395  		return kv.Wrap(errGo).With("experimentKey", p.Request.Experiment.Key).With("stack", stack.Trace().TrimRuntime())
   396  	}
   397  	defer os.RemoveAll(tmpDir)
   398  
   399  	// Move to starting the process that we will monitor with the experiment running within
   400  	// it
   401  
   402  	// #nosec
   403  	cmd := exec.CommandContext(stopCmd, "/bin/bash", "-c", "export TMPDIR="+tmpDir+"; "+filepath.Clean(p.Script))
   404  	cmd.Dir = path.Dir(p.Script)
   405  
   406  	// Pipes are used to allow the output to be tracked interactively from the cmd
   407  	stdout, errGo := cmd.StdoutPipe()
   408  	if errGo != nil {
   409  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   410  	}
   411  	stderr, errGo := cmd.StderrPipe()
   412  	if errGo != nil {
   413  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   414  	}
   415  
   416  	outC := make(chan []byte)
   417  	defer close(outC)
   418  	errC := make(chan string)
   419  	defer close(errC)
   420  
   421  	// Prepare an output file into which the command line stdout and stderr will be written
   422  	outputFN := filepath.Join(cmd.Dir, "..", "output", "output")
   423  	f, errGo := os.Create(outputFN)
   424  	if errGo != nil {
   425  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   426  	}
   427  
   428  	// A quit channel is used to allow fine grained control over when the IO
   429  	// copy and output task should be created
   430  	stopOutput := make(chan struct{}, 1)
   431  
   432  	// Being the go routine that takes cmd output and appends it to a file on disk
   433  	go procOutput(stopOutput, f, outC, errC)
   434  
   435  	// Start begins the processing asynchronously, the procOutput above will collect the
   436  	// run results are they are output asynchronously
   437  	if errGo = cmd.Start(); errGo != nil {
   438  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   439  	}
   440  
   441  	// Protect the err value when running multiple goroutines
   442  	errCheck := sync.Mutex{}
   443  
   444  	// This code connects the pipes being used by the golang exec command process to the channels that
   445  	// will be used to bring the output into a single file
   446  	waitOnIO := sync.WaitGroup{}
   447  	waitOnIO.Add(2)
   448  
   449  	go func() {
   450  		defer waitOnIO.Done()
   451  
   452  		time.Sleep(time.Second)
   453  		s := bufio.NewScanner(stdout)
   454  		s.Split(bufio.ScanRunes)
   455  		for s.Scan() {
   456  			outC <- s.Bytes()
   457  		}
   458  		if errGo := s.Err(); errGo != nil {
   459  			errCheck.Lock()
   460  			defer errCheck.Unlock()
   461  			if err != nil && err != os.ErrClosed {
   462  				err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   463  			}
   464  		}
   465  	}()
   466  
   467  	go func() {
   468  		defer waitOnIO.Done()
   469  
   470  		time.Sleep(time.Second)
   471  		s := bufio.NewScanner(stderr)
   472  		s.Split(bufio.ScanLines)
   473  		for s.Scan() {
   474  			errC <- s.Text()
   475  		}
   476  		if errGo := s.Err(); errGo != nil {
   477  			errCheck.Lock()
   478  			defer errCheck.Unlock()
   479  			if err != nil && err != os.ErrClosed {
   480  				err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   481  			}
   482  		}
   483  	}()
   484  
   485  	// Wait for the IO to stop before continuing to tell the background
   486  	// writer to terminate. This means the IO for the process will
   487  	// be able to send on the channels until they have stopped.
   488  	waitOnIO.Wait()
   489  
   490  	// Now manually stop the process output copy goroutine once the exec package
   491  	// has finished
   492  	close(stopOutput)
   493  
   494  	// Wait for the process to exit, and store any error code if possible
   495  	// before we continue to wait on the processes output devices finishing
   496  	if errGo = cmd.Wait(); errGo != nil {
   497  		errCheck.Lock()
   498  		if err == nil {
   499  			err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   500  		}
   501  		errCheck.Unlock()
   502  	}
   503  
   504  	errCheck.Lock()
   505  	if err == nil && stopCmd.Err() != nil {
   506  		err = kv.Wrap(stopCmd.Err()).With("stack", stack.Trace().TrimRuntime())
   507  	}
   508  	errCheck.Unlock()
   509  
   510  	return err
   511  }
   512  
// Close is used to close any resources which the encapsulated VirtualEnv may have consumed.
// This implementation releases nothing and always returns nil.
//
func (*VirtualEnv) Close() (err kv.Error) {
	return nil
}