github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/singularity.go (about)

     1  // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
     2  
     3  package runner
     4  
     5  // This file contains the implementation of an execution module for singularity
     6  // within the studioML go runner
     7  //
     8  
     9  import (
    10  	"bufio"
    11  	"bytes"
    12  	"context"
    13  	"fmt"
    14  	"io/ioutil"
    15  	"os"
    16  	"os/exec"
    17  	"path/filepath"
    18  	"sort"
    19  	"strings"
    20  	"sync"
    21  	"text/template"
    22  	"time"
    23  
    24  	"github.com/go-stack/stack"
    25  	"github.com/jjeffery/kv" // MIT License
    26  )
    27  
// Singularity is a data structure that contains the description of a singularity container resource
type Singularity struct {
	Request   *Request // Request the container is being prepared for, supplied to NewSingularity
	BaseDir   string   // Directory under which the _runner, workspace and output directories are resolved
	BaseImage string   // Qualified name of the base image, taken from the _singularity artifact
}
    34  
    35  // NewSingularity is used to instantiate a singularity resource based upon a request, typically sent
    36  // across a go channel or similar
    37  func NewSingularity(rqst *Request, dir string) (sing *Singularity, err kv.Error) {
    38  
    39  	sing = &Singularity{
    40  		Request: rqst,
    41  		BaseDir: dir,
    42  	}
    43  
    44  	art, isPresent := rqst.Experiment.Artifacts["_singularity"]
    45  	if !isPresent {
    46  		return nil, kv.NewError("_singularity artifact is missing").With("stack", stack.Trace().TrimRuntime())
    47  	}
    48  
    49  	// Look for the singularity artifact and extract the base image name
    50  	// that will be used from shub://sentient-singularity
    51  	//
    52  	if errGo := os.MkdirAll(filepath.Join(dir, "_runner"), 0700); errGo != nil {
    53  		return nil, kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
    54  	}
    55  
    56  	os.MkdirAll(filepath.Join(dir, "..", "blob-cache"), 0700)
    57  	os.MkdirAll(filepath.Join(dir, "..", "queue"), 0700)
    58  	os.MkdirAll(filepath.Join(dir, "..", "artifact-mappings", rqst.Experiment.Key), 0700)
    59  
    60  	sing.BaseImage = art.Qualified
    61  	switch {
    62  	case strings.HasPrefix(art.Qualified, "shub://sentient-singularity/"):
    63  	case strings.HasPrefix(art.Qualified, "dockerhub://tensorflow/"):
    64  	default:
    65  		return nil, kv.NewError("untrusted image specified").With("stack", stack.Trace().TrimRuntime()).With("artifact", art)
    66  	}
    67  	return sing, nil
    68  }
    69  
    70  func (s *Singularity) makeDef(alloc *Allocated, e interface{}) (fn string, err kv.Error) {
    71  
    72  	// Extract all of the python variables into two collections with the studioML extracted out
    73  	// Ignore the tensorflow version as the container is responsible for cuda
    74  	pips, cfgPips, studioPIP, _ := pythonModules(s.Request, alloc)
    75  
    76  	// If the studioPIP was specified but we have a dist directory then we need to clear the
    77  	// studioPIP, otherwise leave it there
    78  	pth, errGo := filepath.Abs(filepath.Join(s.BaseDir, "workspace", "dist", "studioml-*.tar.gz"))
    79  	if errGo != nil {
    80  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
    81  	}
    82  	matches, _ := filepath.Glob(pth)
    83  	if len(matches) != 0 {
    84  		// Extract the most recent version of studioML from the dist directory
    85  		sort.Strings(matches)
    86  		studioPIP = matches[len(matches)-1]
    87  	}
    88  
    89  	params := struct {
    90  		E         interface{}
    91  		S         *Singularity
    92  		I         string
    93  		Dir       string
    94  		Pips      []string
    95  		CfgPips   []string
    96  		StudioPIP string
    97  		ImgType   string
    98  	}{
    99  		E:         e,
   100  		S:         s,
   101  		I:         s.BaseImage,
   102  		Dir:       filepath.Join(s.BaseDir, "_runner"),
   103  		Pips:      pips,
   104  		CfgPips:   cfgPips,
   105  		StudioPIP: studioPIP,
   106  	}
   107  
   108  	switch {
   109  	case strings.HasPrefix(params.I, "shub://singularity-hub/sentient-singularity"):
   110  		params.ImgType = "debootstrap"
   111  	case strings.HasPrefix(params.I, "dockerhub://tensorflow/"):
   112  		params.ImgType = "docker"
   113  		params.I = strings.Replace(params.I, "dockerhub://", "", 1)
   114  	}
   115  
   116  	// Create a shell script that will do everything needed to run
   117  	// the python environment in a virtual env
   118  	tmpl, errGo := template.New("singularityRunner").Parse(
   119  		`Bootstrap: {{.ImgType}}
   120  From: {{.I}}
   121  
   122  %labels
   123  ai.sentient.maintainer Karl Mutch
   124  ai.sentient.version 0.0
   125  
   126  %post
   127  {{range $key, $value := .E.Request.Config.Env}}
   128      echo 'export {{$key}}="{{$value}}"' >> $SINGULARITY_ENVIRONMENT
   129  {{end}}
   130  {{range $key, $value := .E.ExprEnvs}}
   131      echo 'export {{$key}}="{{$value}}"' >> $SINGULARITY_ENVIRONMENT
   132  {{end}}
   133      echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu/' >> $SINGULARITY_ENVIRONMENT
   134  	echo 'export STUDIOML_EXPERIMENT={{.E.ExprSubDir}}' >> $SINGULARITY_ENVIRONMENT
   135  	echo 'export STUDIOML_HOME={{.E.RootDir}}' >> $SINGULARITY_ENVIRONMENT
   136  	pip install virtualenv
   137  	virtualenv {{.Dir}}
   138  	chmod +x {{.Dir}}/bin/activate
   139  	{{.Dir}}/bin/activate
   140  	pip freeze
   141  	{{if .StudioPIP}}
   142  	pip install -I {{.StudioPIP}}
   143  	{{end}}
   144  	{{if .Pips}}
   145  	pip install -I {{range .Pips}} {{.}}{{end}}
   146  	{{end}}
   147  	pip install pyopenssl --upgrade
   148  	{{if .CfgPips}}
   149  	pip install {{range .CfgPips}} {{.}}{{end}}
   150  	{{end}}
   151  	pip freeze
   152  
   153  %runscript
   154  	{{.Dir}}/bin/activate
   155  	cd {{.E.ExprDir}}/workspace
   156  	python {{.E.Request.Experiment.Filename}} {{range .E.Request.Experiment.Args}}{{.}} {{end}}
   157  	date
   158  `)
   159  
   160  	if errGo != nil {
   161  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   162  	}
   163  
   164  	content := new(bytes.Buffer)
   165  	errGo = tmpl.Execute(content, params)
   166  	if errGo != nil {
   167  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   168  	}
   169  
   170  	fn = filepath.Join(s.BaseDir, "_runner", "Singularity.def")
   171  	if errGo = ioutil.WriteFile(fn, content.Bytes(), 0600); errGo != nil {
   172  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   173  	}
   174  	return fn, nil
   175  }
   176  
   177  func (s *Singularity) makeBuildScript(e interface{}) (fn string, err kv.Error) {
   178  
   179  	fn = filepath.Join(s.BaseDir, "_runner", "build.sh")
   180  
   181  	params := struct {
   182  		Dir       string
   183  		BaseImage string
   184  	}{
   185  		Dir:       filepath.Join(s.BaseDir, "_runner"),
   186  		BaseImage: s.BaseImage,
   187  	}
   188  
   189  	tmpl, errGo := template.New("singularityRunner").Parse(
   190  		`#!/bin/bash -x
   191  sudo singularity build {{.Dir}}/runner.img {{.Dir}}/Singularity.def
   192  `)
   193  
   194  	if errGo != nil {
   195  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   196  	}
   197  
   198  	content := new(bytes.Buffer)
   199  	errGo = tmpl.Execute(content, params)
   200  	if errGo != nil {
   201  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   202  	}
   203  
   204  	if errGo := ioutil.WriteFile(fn, content.Bytes(), 0700); errGo != nil {
   205  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   206  	}
   207  	return fn, nil
   208  }
   209  
   210  func (s *Singularity) runBuildScript(script string) (err kv.Error) {
   211  
   212  	ctx := context.Background()
   213  	outputFN := filepath.Join(s.BaseDir, "output", "output")
   214  
   215  	// Move to starting the process that we will monitor with the experiment running within
   216  	// it
   217  	//
   218  
   219  	reporterC := make(chan *string)
   220  	defer close(reporterC)
   221  
   222  	go func() {
   223  		for {
   224  			select {
   225  			case msg := <-reporterC:
   226  				if msg == nil {
   227  					return
   228  				}
   229  			}
   230  		}
   231  	}()
   232  
   233  	return runWait(ctx, script, filepath.Join(s.BaseDir, "_runner"), outputFN, reporterC)
   234  }
   235  
   236  func (s *Singularity) makeExecScript(e interface{}) (fn string, err kv.Error) {
   237  
   238  	fn = filepath.Join(s.BaseDir, "_runner", "exec.sh")
   239  
   240  	params := struct {
   241  		Dir string
   242  	}{
   243  		Dir: filepath.Join(s.BaseDir, "_runner"),
   244  	}
   245  
   246  	tmpl, errGo := template.New("singularityRunner").Parse(
   247  		`#!/bin/bash -x
   248  singularity run --home {{.Dir}} -B /tmp:/tmp -B /usr/local/cuda:/usr/local/cuda -B /usr/lib/nvidia-384:/usr/lib/nvidia-384 --nv {{.Dir}}/runner.img
   249  `)
   250  
   251  	if errGo != nil {
   252  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   253  	}
   254  
   255  	content := new(bytes.Buffer)
   256  	errGo = tmpl.Execute(content, params)
   257  	if errGo != nil {
   258  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   259  	}
   260  
   261  	if errGo := ioutil.WriteFile(fn, content.Bytes(), 0700); errGo != nil {
   262  		return "", kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   263  	}
   264  	return fn, nil
   265  }
   266  
   267  // Make is used to write a script file that is generated for the specific TF tasks studioml has sent
   268  // to retrieve any python packages etc then to run the task
   269  //
   270  func (s *Singularity) Make(alloc *Allocated, e interface{}) (err kv.Error) {
   271  
   272  	_, err = s.makeDef(alloc, e)
   273  	if err != nil {
   274  		return err
   275  	}
   276  
   277  	script, err := s.makeBuildScript(e)
   278  	if err != nil {
   279  		return err
   280  	}
   281  
   282  	if err = s.runBuildScript(script); err != nil {
   283  		return err
   284  	}
   285  
   286  	if _, err = s.makeExecScript(e); err != nil {
   287  		return err
   288  	}
   289  
   290  	return nil
   291  }
   292  
   293  // Run will use a generated script file and will run it to completion while marshalling
   294  // results and files from the computation.  Run is a blocking call and will only return
   295  // upon completion or termination of the process it starts
   296  //
   297  func (s *Singularity) Run(ctx context.Context, refresh map[string]Artifact) (err kv.Error) {
   298  
   299  	outputFN := filepath.Join(s.BaseDir, "output", "output")
   300  	script := filepath.Join(s.BaseDir, "_runner", "exec.sh")
   301  
   302  	reporterC := make(chan *string)
   303  	defer close(reporterC)
   304  
   305  	go func() {
   306  		for {
   307  			select {
   308  			case msg := <-reporterC:
   309  				if msg == nil {
   310  					return
   311  				}
   312  			}
   313  		}
   314  	}()
   315  
   316  	return runWait(ctx, script, filepath.Join(s.BaseDir, "_runner"), outputFN, reporterC)
   317  }
   318  
   319  func runWait(ctx context.Context, script string, dir string, outputFN string, errorC chan *string) (err kv.Error) {
   320  
   321  	stopCmd, stopCmdCancel := context.WithCancel(context.Background())
   322  	// defers are stacked in LIFO order so cancelling this context is the last
   323  	// thing this function will do
   324  	defer stopCmdCancel()
   325  
   326  	// Move to starting the process that we will monitor with the experiment running within
   327  	// it
   328  
   329  	// #nosec
   330  	cmd := exec.Command("/bin/bash", "-c", filepath.Clean(script))
   331  	cmd.Dir = dir
   332  
   333  	stdout, errGo := cmd.StdoutPipe()
   334  	if errGo != nil {
   335  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   336  	}
   337  	stderr, errGo := cmd.StderrPipe()
   338  	if errGo != nil {
   339  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   340  	}
   341  
   342  	outC := make(chan []byte)
   343  	defer close(outC)
   344  	errC := make(chan string)
   345  	defer close(errC)
   346  
   347  	f, errGo := os.Create(outputFN)
   348  	if errGo != nil {
   349  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime()).With("outputFN", outputFN)
   350  	}
   351  
   352  	stopCopy := make(chan struct{}, 1)
   353  	go procOutput(stopCopy, f, outC, errC)
   354  
   355  	if errGo = cmd.Start(); err != nil {
   356  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   357  	}
   358  
   359  	waitOnIO := sync.WaitGroup{}
   360  	waitOnIO.Add(2)
   361  
   362  	go func() {
   363  		defer waitOnIO.Done()
   364  		time.Sleep(time.Second)
   365  		s := bufio.NewScanner(stdout)
   366  		s.Split(bufio.ScanRunes)
   367  		for s.Scan() {
   368  			outC <- s.Bytes()
   369  		}
   370  		if errGo := s.Err(); errGo != nil {
   371  			if err != nil {
   372  				err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   373  			}
   374  		}
   375  	}()
   376  
   377  	go func() {
   378  		defer waitOnIO.Done()
   379  		time.Sleep(time.Second)
   380  		s := bufio.NewScanner(stderr)
   381  		s.Split(bufio.ScanLines)
   382  		for s.Scan() {
   383  			errC <- s.Text()
   384  		}
   385  		if errGo := s.Err(); errGo != nil {
   386  			if err != nil {
   387  				err = kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   388  			}
   389  		}
   390  	}()
   391  
   392  	go func() {
   393  		for {
   394  			select {
   395  			case <-ctx.Done():
   396  				if errGo := cmd.Process.Kill(); errGo != nil {
   397  					msg := fmt.Sprintf("could not be killed, maximum life time reached, due to %v", errGo)
   398  					select {
   399  					case errorC <- &msg:
   400  					default:
   401  					}
   402  					return
   403  				}
   404  				msg := "killed, maximum life time reached"
   405  				select {
   406  				case errorC <- &msg:
   407  				default:
   408  				}
   409  				return
   410  			case <-stopCmd.Done():
   411  				return
   412  			}
   413  		}
   414  	}()
   415  
   416  	if errGo = cmd.Wait(); err != nil {
   417  		return kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
   418  	}
   419  
   420  	waitOnIO.Wait()
   421  	close(stopCopy)
   422  
   423  	if err == nil && ctx.Err() != nil {
   424  		err = kv.Wrap(ctx.Err()).With("stack", stack.Trace().TrimRuntime())
   425  	}
   426  
   427  	return err
   428  }
   429  
   430  // Close is a stub method for termination of a singularity resource
   431  func (*Singularity) Close() (err kv.Error) {
   432  	return nil
   433  }