gorgonia.org/gorgonia@v0.9.17/cuda/external.go (about) 1 package cuda 2 3 import ( 4 "runtime" 5 6 "github.com/pkg/errors" 7 "gorgonia.org/cu" 8 "gorgonia.org/cu/blas" 9 "gorgonia.org/cu/dnn" 10 ) 11 12 // this file implements all the methods required to fulfil the External interface 13 14 var _ External = &Engine{} 15 16 const ( 17 // Any address of a variable residing in global memory or returned by one of the 18 // memory allocation routines from the driver or runtime API is always aligned to at 19 // least 256 bytes. 20 // 21 memalign = 32 22 scalarAlign = 8 23 ) 24 25 // HasFunc returns true if the execution is external (cgo/cuda/openCL) AND the external device contains the function with the given name 26 func (e *Engine) HasFunc(name string) bool { _, ok := e.f[name]; return ok } 27 28 // Sync returns a channel of sync signals 29 func (e *Engine) Sync() chan struct{} { return e.syncChan } 30 31 // Signal signals the machine to do work 32 func (e *Engine) Signal() { 33 e.workAvailable <- true 34 } 35 36 // Context returns the BatchedContext 37 func (e *Engine) Context() *cu.BatchedContext { return &e.c } 38 39 // CUDNNContext returns the cuDNN context 40 func (e *Engine) CUDNNContext() *cudnn.Context { return &e.n } 41 42 // BLASContext returns the cuBLAS context 43 func (e *Engine) BLASContext() *cublas.Standard { return &e.b } 44 45 // Modules returns the loaded modules indexed by name 46 func (e *Engine) Modules() map[string]cu.Module { return e.m } 47 48 // Functions returns the loaded functions indexed by name 49 func (e *Engine) Functions() map[string]cu.Function { return e.f } 50 51 // ElemGridSize calculates the gridsize for elementwise operations. n is the number of elements 52 func (e *Engine) ElemGridSize(n int) (gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ int) { 53 maxThreads := e.mtpb 54 maxGridX := e.mgdx 55 maxGridY := e.mgdy 56 maxGridZ := e.mgdz 57 58 blockDimX = 1 59 blockDimY = 1 60 blockDimZ = 1 61 gridDimX = 1 62 gridDimY = 1 63 gridDimZ = 1 64 65 blocks := calcBlocks(n, maxThreads) 66 switch { 67 case blocks == 1: 68 blockDimX = n 69 case blocks >= maxGridX*maxGridY*maxGridZ: 70 // what kind of monstrosity is this??! 71 case blocks >= maxGridX*maxGridY: 72 gridDimX = maxGridX 73 gridDimY = maxGridY 74 gridDimZ = calcBlocks(blocks%(maxGridX*maxGridY), maxGridZ) 75 blockDimX = maxThreads 76 case blocks >= maxGridX: 77 gridDimX = maxGridX 78 gridDimY = calcBlocks(blocks%(maxGridX), maxGridY) 79 blockDimX = maxThreads 80 default: 81 gridDimX = blocks 82 blockDimX = maxThreads 83 } 84 return 85 } 86 87 // Init creates a CUDA engine with the given size for the given device 88 func (e *Engine) Init(device cu.Device, size int64) (err error) { 89 e.Lock() 90 initialized := e.initialized 91 e.Unlock() 92 93 if initialized { 94 return nil 95 } 96 97 e.Lock() 98 e.d = device 99 if err = e.doInit(size); err != nil { 100 e.Unlock() 101 err2 := e.Close() 102 if err2 != nil { 103 return errors.Wrapf(err, "Failed to initialize CUDA Engine with size %d for device %v. Additionally, there were errors that occurred when cleaning up %v", size, device, err) 104 } 105 return errors.Wrapf(err, "Failed to initialize CUDA Engine with size %d for device %v", size, device) 106 } 107 e.initialized = true 108 e.Unlock() 109 return 110 } 111 112 func (e *Engine) doInit(size int64) (err error) { 113 e.workAvailable = make(chan bool) 114 e.syncChan = make(chan struct{}) 115 e.finishChan = make(chan struct{}) 116 e.finishChan2 = make(chan struct{}, 1) 117 e.a = makeBFC(memalign) 118 119 // create and set context 120 var cuctx cu.CUContext 121 ctxFlag := cu.SchedAuto 122 if cuctx, err = e.d.MakeContext(ctxFlag); err != nil { 123 if err == cu.OutOfMemory { 124 free, total, err2 := cu.MemInfo() 125 if err2 != nil { 126 return errors.Wrapf(err, "Out of memory. Additionally errors were found while retrieving mem info %v", err2) 127 } 128 return errors.Wrapf(err, "Out of memory. Free: %v, total %v | %v", free, total, cuctx) 129 } 130 return errors.Wrapf(err, "Failed to make context for device %d", e.d) 131 } 132 e.c = *(cu.NewBatchedContext(cu.CtxFromCUContext(e.d, cuctx, ctxFlag), e.d)) 133 134 var attrs []int 135 if attrs, err = e.d.Attributes(cu.WarpSize, cu.MaxThreadsPerBlock, cu.MaxGridDimX, cu.MaxGridDimY, cu.MaxGridDimZ, cu.MaxBlockDimX, cu.MaxBlockDimY, cu.MaxBlockDimZ); err != nil { 136 return errors.Wrapf(err, "Failed to get attributes for device %v.", e.d) 137 } 138 139 e.warp = attrs[0] 140 e.mtpb = attrs[1] 141 e.mgdx = attrs[2] 142 e.mgdy = attrs[3] 143 e.mgdz = attrs[4] 144 e.mbdx = attrs[5] 145 e.mbdy = attrs[6] 146 e.mbdz = attrs[7] 147 148 e.m = make(map[string]cu.Module) 149 e.f = make(map[string]cu.Function) 150 151 // actual work to allocate from graphics card 152 153 if e.freeMem, e.totalMem, err = cu.MemInfo(); err != nil { 154 return errors.Wrapf(err, "Failed to get free and total mem for device %v", e.d) 155 } 156 157 // actually reserve memory for the allocator 158 var allocsize int64 = 2*size + (size / 2) + minAllocSize 159 if allocsize >= e.freeMem { 160 return errors.Errorf("Unable to get %v bytes. Free memory available %v", allocsize, e.freeMem) 161 } 162 ptr, err := cu.MemAllocManaged(allocsize, cu.AttachGlobal) 163 if err != nil { 164 return errors.Wrapf(err, "Failed to allocate %v bytes of managed memory for %v", allocsize, e.d) 165 } 166 e.a.reserve(uintptr(ptr), allocsize) 167 e.n = *(cudnn.NewContext()) 168 go e.Run() 169 return nil 170 } 171 172 // Close cleans up the machine, and closes all available resources 173 func (e *Engine) Close() error { 174 e.Lock() 175 defer e.Unlock() 176 e.c.Cleanup() // frees all ancillary allocations in C land 177 if e.c.Context == nil { 178 return nil 179 } 180 cu.SetCurrentContext(e.c.Context.CUDAContext()) 181 182 // Unload all modules (and consequently all functions) 183 for name, mod := range e.m { 184 if err := mod.Unload(); err != nil { 185 return errors.Wrapf(err, "Failed to unload module %v", name) 186 } 187 } 188 189 // Free all CUDA memory 190 if e.a.start != 0 { 191 cu.MemFree(cu.DevicePtr(e.a.start)) 192 } 193 e.a.reset() 194 195 closeB := func() error { return e.b.Close() } 196 197 if err := e.c.Do(closeB); err != nil { 198 return errors.Wrap(e.err, "Failed to close cuBLAS context") 199 } 200 201 closeN := func() error { return e.n.Close() } 202 203 if err := e.c.Do(closeN); err != nil { 204 return errors.Wrap(e.err, "Failed to close cuDNN context") 205 } 206 207 if e.workAvailable != nil { 208 close(e.workAvailable) 209 } 210 211 if err := e.c.Close(); err != nil { 212 return errors.Wrapf(err, "Failed to cloes CUDA Context ") 213 } 214 215 runtime.Gosched() // make sure everyone has a fair play 216 e.finishChan <- struct{}{} 217 e.finishChan2 <- struct{}{} // wait 218 e.initialized = false 219 return nil 220 } 221 222 // DoWork sends a signal to the batched CUDA Context to actually do work 223 func (e *Engine) DoWork() error { 224 e.c.DoWork() 225 return e.c.Errors() 226 } 227 228 // Run initialises and starts the engine 229 func (e *Engine) Run() { 230 e.Lock() 231 if e.running { 232 e.Unlock() 233 return 234 } 235 e.Unlock() 236 237 runtime.LockOSThread() 238 defer runtime.UnlockOSThread() 239 240 // finish initialization 241 e.b.Init(cublas.WithContext(&e.c)) 242 243 // finishChan2 blocks any external commands to engine (like Close) until it's ready to finish. 244 e.finishChan2 <- struct{}{} 245 246 loop: 247 for { 248 select { 249 case <-e.c.WorkAvailable(): 250 e.c.DoWork() 251 if err := e.c.Errors(); err != nil { 252 e.Lock() 253 e.err = err 254 e.running = false 255 e.Unlock() 256 break loop 257 } 258 case w := <-e.c.Work(): 259 if w != nil { 260 err := w() 261 e.c.ErrChan() <- err 262 263 if err != nil { 264 e.Lock() 265 e.err = err 266 e.running = false 267 e.Unlock() 268 break loop 269 } 270 } 271 case <-e.finishChan: 272 break loop 273 } 274 } 275 <-e.finishChan2 276 } 277 278 // blockThread is an easier version of calculating <<threads, blocks>> for CUDA. Useful for debugging 279 func (e *Engine) blockThread(n, dev int) (blocks, threads int) { 280 switch { 281 case n <= 32: 282 threads = 32 283 case n <= 64: 284 threads = 64 285 case n <= 128: 286 threads = 128 287 case n <= 256: 288 threads = 256 289 case n <= 512: 290 threads = 512 291 default: 292 threads = 1024 293 } 294 295 blocks = (n + threads - 1) / threads 296 if blocks < 0 || blocks > 128 { 297 blocks = 128 298 } 299 return 300 } 301 302 // it's just a generic ceiling function. Added here to avoid mixing with any potential ceilInt operation 303 func calcBlocks(n, maxThreads int) int { return (n + maxThreads - 1) / maxThreads }