gorgonia.org/gorgonia@v0.9.17/cuda/external.go (about)

     1  package cuda
     2  
     3  import (
     4  	"runtime"
     5  
     6  	"github.com/pkg/errors"
     7  	"gorgonia.org/cu"
     8  	"gorgonia.org/cu/blas"
     9  	"gorgonia.org/cu/dnn"
    10  )
    11  
// This file implements all the methods required to fulfil the External interface.
    13  
    14  var _ External = &Engine{}
    15  
    16  const (
    17  	// Any address of a variable residing in global memory or returned by one of the
    18  	// memory allocation routines from the driver or runtime API is always aligned to at
    19  	// least 256 bytes.
    20  	//
    21  	memalign    = 32
    22  	scalarAlign = 8
    23  )
    24  
    25  // HasFunc returns true if the execution is external (cgo/cuda/openCL) AND the external device contains the function with the given name
    26  func (e *Engine) HasFunc(name string) bool { _, ok := e.f[name]; return ok }
    27  
    28  // Sync returns a channel of sync signals
    29  func (e *Engine) Sync() chan struct{} { return e.syncChan }
    30  
    31  // Signal signals the machine to do work
    32  func (e *Engine) Signal() {
    33  	e.workAvailable <- true
    34  }
    35  
    36  // Context returns the BatchedContext
    37  func (e *Engine) Context() *cu.BatchedContext { return &e.c }
    38  
    39  // CUDNNContext returns the cuDNN context
    40  func (e *Engine) CUDNNContext() *cudnn.Context { return &e.n }
    41  
    42  // BLASContext returns the cuBLAS context
    43  func (e *Engine) BLASContext() *cublas.Standard { return &e.b }
    44  
    45  // Modules returns the loaded modules indexed by name
    46  func (e *Engine) Modules() map[string]cu.Module { return e.m }
    47  
    48  // Functions returns the loaded functions indexed by name
    49  func (e *Engine) Functions() map[string]cu.Function { return e.f }
    50  
    51  // ElemGridSize calculates the gridsize for elementwise operations. n is the number of elements
    52  func (e *Engine) ElemGridSize(n int) (gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ int) {
    53  	maxThreads := e.mtpb
    54  	maxGridX := e.mgdx
    55  	maxGridY := e.mgdy
    56  	maxGridZ := e.mgdz
    57  
    58  	blockDimX = 1
    59  	blockDimY = 1
    60  	blockDimZ = 1
    61  	gridDimX = 1
    62  	gridDimY = 1
    63  	gridDimZ = 1
    64  
    65  	blocks := calcBlocks(n, maxThreads)
    66  	switch {
    67  	case blocks == 1:
    68  		blockDimX = n
    69  	case blocks >= maxGridX*maxGridY*maxGridZ:
    70  		// what kind of monstrosity is this??!
    71  	case blocks >= maxGridX*maxGridY:
    72  		gridDimX = maxGridX
    73  		gridDimY = maxGridY
    74  		gridDimZ = calcBlocks(blocks%(maxGridX*maxGridY), maxGridZ)
    75  		blockDimX = maxThreads
    76  	case blocks >= maxGridX:
    77  		gridDimX = maxGridX
    78  		gridDimY = calcBlocks(blocks%(maxGridX), maxGridY)
    79  		blockDimX = maxThreads
    80  	default:
    81  		gridDimX = blocks
    82  		blockDimX = maxThreads
    83  	}
    84  	return
    85  }
    86  
    87  // Init creates a CUDA engine with the given size for the given device
    88  func (e *Engine) Init(device cu.Device, size int64) (err error) {
    89  	e.Lock()
    90  	initialized := e.initialized
    91  	e.Unlock()
    92  
    93  	if initialized {
    94  		return nil
    95  	}
    96  
    97  	e.Lock()
    98  	e.d = device
    99  	if err = e.doInit(size); err != nil {
   100  		e.Unlock()
   101  		err2 := e.Close()
   102  		if err2 != nil {
   103  			return errors.Wrapf(err, "Failed to initialize CUDA Engine with size %d for device %v. Additionally, there were errors that occurred when cleaning up %v", size, device, err)
   104  		}
   105  		return errors.Wrapf(err, "Failed to initialize CUDA Engine with size %d for device %v", size, device)
   106  	}
   107  	e.initialized = true
   108  	e.Unlock()
   109  	return
   110  }
   111  
   112  func (e *Engine) doInit(size int64) (err error) {
   113  	e.workAvailable = make(chan bool)
   114  	e.syncChan = make(chan struct{})
   115  	e.finishChan = make(chan struct{})
   116  	e.finishChan2 = make(chan struct{}, 1)
   117  	e.a = makeBFC(memalign)
   118  
   119  	// create and set context
   120  	var cuctx cu.CUContext
   121  	ctxFlag := cu.SchedAuto
   122  	if cuctx, err = e.d.MakeContext(ctxFlag); err != nil {
   123  		if err == cu.OutOfMemory {
   124  			free, total, err2 := cu.MemInfo()
   125  			if err2 != nil {
   126  				return errors.Wrapf(err, "Out of memory. Additionally errors were found while retrieving mem info %v", err2)
   127  			}
   128  			return errors.Wrapf(err, "Out of memory. Free: %v, total %v | %v", free, total, cuctx)
   129  		}
   130  		return errors.Wrapf(err, "Failed to make context for device %d", e.d)
   131  	}
   132  	e.c = *(cu.NewBatchedContext(cu.CtxFromCUContext(e.d, cuctx, ctxFlag), e.d))
   133  
   134  	var attrs []int
   135  	if attrs, err = e.d.Attributes(cu.WarpSize, cu.MaxThreadsPerBlock, cu.MaxGridDimX, cu.MaxGridDimY, cu.MaxGridDimZ, cu.MaxBlockDimX, cu.MaxBlockDimY, cu.MaxBlockDimZ); err != nil {
   136  		return errors.Wrapf(err, "Failed to get attributes for device %v.", e.d)
   137  	}
   138  
   139  	e.warp = attrs[0]
   140  	e.mtpb = attrs[1]
   141  	e.mgdx = attrs[2]
   142  	e.mgdy = attrs[3]
   143  	e.mgdz = attrs[4]
   144  	e.mbdx = attrs[5]
   145  	e.mbdy = attrs[6]
   146  	e.mbdz = attrs[7]
   147  
   148  	e.m = make(map[string]cu.Module)
   149  	e.f = make(map[string]cu.Function)
   150  
   151  	// actual work to allocate from graphics card
   152  
   153  	if e.freeMem, e.totalMem, err = cu.MemInfo(); err != nil {
   154  		return errors.Wrapf(err, "Failed to get free and total mem for device %v", e.d)
   155  	}
   156  
   157  	// actually reserve memory for the allocator
   158  	var allocsize int64 = 2*size + (size / 2) + minAllocSize
   159  	if allocsize >= e.freeMem {
   160  		return errors.Errorf("Unable to get %v bytes. Free memory available %v", allocsize, e.freeMem)
   161  	}
   162  	ptr, err := cu.MemAllocManaged(allocsize, cu.AttachGlobal)
   163  	if err != nil {
   164  		return errors.Wrapf(err, "Failed to allocate %v bytes of managed memory for %v", allocsize, e.d)
   165  	}
   166  	e.a.reserve(uintptr(ptr), allocsize)
   167  	e.n = *(cudnn.NewContext())
   168  	go e.Run()
   169  	return nil
   170  }
   171  
   172  // Close cleans up the machine, and closes all available resources
   173  func (e *Engine) Close() error {
   174  	e.Lock()
   175  	defer e.Unlock()
   176  	e.c.Cleanup() // frees all ancillary allocations in C land
   177  	if e.c.Context == nil {
   178  		return nil
   179  	}
   180  	cu.SetCurrentContext(e.c.Context.CUDAContext())
   181  
   182  	// Unload all modules (and consequently all functions)
   183  	for name, mod := range e.m {
   184  		if err := mod.Unload(); err != nil {
   185  			return errors.Wrapf(err, "Failed to unload module %v", name)
   186  		}
   187  	}
   188  
   189  	// Free all CUDA memory
   190  	if e.a.start != 0 {
   191  		cu.MemFree(cu.DevicePtr(e.a.start))
   192  	}
   193  	e.a.reset()
   194  
   195  	closeB := func() error { return e.b.Close() }
   196  
   197  	if err := e.c.Do(closeB); err != nil {
   198  		return errors.Wrap(e.err, "Failed to close cuBLAS context")
   199  	}
   200  
   201  	closeN := func() error { return e.n.Close() }
   202  
   203  	if err := e.c.Do(closeN); err != nil {
   204  		return errors.Wrap(e.err, "Failed to close cuDNN context")
   205  	}
   206  
   207  	if e.workAvailable != nil {
   208  		close(e.workAvailable)
   209  	}
   210  
   211  	if err := e.c.Close(); err != nil {
   212  		return errors.Wrapf(err, "Failed to cloes CUDA Context ")
   213  	}
   214  
   215  	runtime.Gosched() // make sure everyone has a fair play
   216  	e.finishChan <- struct{}{}
   217  	e.finishChan2 <- struct{}{} // wait
   218  	e.initialized = false
   219  	return nil
   220  }
   221  
   222  // DoWork sends a signal to the batched CUDA Context to actually do work
   223  func (e *Engine) DoWork() error {
   224  	e.c.DoWork()
   225  	return e.c.Errors()
   226  }
   227  
   228  // Run initialises and starts the engine
   229  func (e *Engine) Run() {
   230  	e.Lock()
   231  	if e.running {
   232  		e.Unlock()
   233  		return
   234  	}
   235  	e.Unlock()
   236  
   237  	runtime.LockOSThread()
   238  	defer runtime.UnlockOSThread()
   239  
   240  	// finish initialization
   241  	e.b.Init(cublas.WithContext(&e.c))
   242  
   243  	// finishChan2 blocks any external commands to engine (like Close) until it's ready to finish.
   244  	e.finishChan2 <- struct{}{}
   245  
   246  loop:
   247  	for {
   248  		select {
   249  		case <-e.c.WorkAvailable():
   250  			e.c.DoWork()
   251  			if err := e.c.Errors(); err != nil {
   252  				e.Lock()
   253  				e.err = err
   254  				e.running = false
   255  				e.Unlock()
   256  				break loop
   257  			}
   258  		case w := <-e.c.Work():
   259  			if w != nil {
   260  				err := w()
   261  				e.c.ErrChan() <- err
   262  
   263  				if err != nil {
   264  					e.Lock()
   265  					e.err = err
   266  					e.running = false
   267  					e.Unlock()
   268  					break loop
   269  				}
   270  			}
   271  		case <-e.finishChan:
   272  			break loop
   273  		}
   274  	}
   275  	<-e.finishChan2
   276  }
   277  
   278  // blockThread is an easier version of calculating <<threads, blocks>> for CUDA. Useful for debugging
   279  func (e *Engine) blockThread(n, dev int) (blocks, threads int) {
   280  	switch {
   281  	case n <= 32:
   282  		threads = 32
   283  	case n <= 64:
   284  		threads = 64
   285  	case n <= 128:
   286  		threads = 128
   287  	case n <= 256:
   288  		threads = 256
   289  	case n <= 512:
   290  		threads = 512
   291  	default:
   292  		threads = 1024
   293  	}
   294  
   295  	blocks = (n + threads - 1) / threads
   296  	if blocks < 0 || blocks > 128 {
   297  		blocks = 128
   298  	}
   299  	return
   300  }
   301  
   302  // it's just a generic ceiling function. Added here to avoid mixing with any potential ceilInt operation
   303  func calcBlocks(n, maxThreads int) int { return (n + maxThreads - 1) / maxThreads }