github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/memutils/memory/rmm_alloc.cu

// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <rmm/rmm.h>
#include <rmm/rmm_api.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "../memory.h"

const int MAX_ERROR_LEN = 100;

// checkCUDAError checks the CUDA error state of the last runtime call and, on
// failure, returns a pointer to a heap-allocated error message buffer. The
// buffer must be released by the caller or an upper caller.
char *checkCUDAError(const char *message) {
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    snprintf(buffer, MAX_ERROR_LEN,
             "ERROR when calling CUDA functions from host: %s: %s\n",
             message, cudaGetErrorString(error));
    return buffer;
  }
  return NULL;
}

// checkRMMError checks the given RMM status and, on failure, returns a
// heap-allocated error message buffer that the caller must release.
char *checkRMMError(rmmError_t rmmError, const char *message) {
  if (rmmError != RMM_SUCCESS) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    snprintf(buffer, MAX_ERROR_LEN,
             "ERROR when calling RMM functions: %s: %s\n",
             message, rmmGetErrorString(rmmError));
    return buffer;
  }
  return NULL;
}

// GetFlags reports that this implementation manages device memory and is
// backed by a pooled (RMM) allocator.
DeviceMemoryFlags GetFlags() {
  return DEVICE_MEMORY_IMPLEMENTATION_FLAG | POOLED_MEMORY_FLAG;
}

// Init initializes an RMM memory pool on every visible device.
CGoCallResHandle Init() {
  CGoCallResHandle resHandle = GetDeviceCount();
  if (resHandle.pStrErr != nullptr) {
    return resHandle;
  }

  size_t deviceCount = reinterpret_cast<size_t>(resHandle.res);
  for (size_t device = 0; device < deviceCount; device++) {
    cudaSetDevice(static_cast<int>(device));
    rmmOptions_t options = {
        PoolAllocation,
        0,     // Default to half of total memory.
        false  // Disable logging.
    };
    resHandle.pStrErr = checkRMMError(rmmInitialize(&options), "rmmInitialize");
    if (resHandle.pStrErr != nullptr) {
      return resHandle;
    }
  }
  return resHandle;
}

// DeviceAllocate allocates zeroed device memory from the RMM pool on the
// given device.
CGoCallResHandle DeviceAllocate(size_t bytes, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_ALLOC(&resHandle.res, bytes, 0),
                                    "DeviceAllocate");
  if (resHandle.pStrErr == nullptr) {
    cudaMemset(resHandle.res, 0, bytes);
    resHandle.pStrErr = checkCUDAError("DeviceAllocate");
  }
  return resHandle;
}

CGoCallResHandle DeviceFree(void *p, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_FREE(p, 0), "DeviceFree");
  return resHandle;
}
// The following implementations are the same as in cuda_malloc.cu. We might
// remove the cuda_malloc.cu file after RMM is proven to work in a production
// environment.

CGoCallResHandle HostAlloc(size_t bytes) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // cudaHostAllocPortable makes sure that the allocation is associated with
  // all devices, not just the current device.
  cudaHostAlloc(&resHandle.res, bytes, cudaHostAllocPortable);
  resHandle.pStrErr = checkCUDAError("HostAlloc");
  // Only zero the buffer if the allocation succeeded.
  if (resHandle.pStrErr == nullptr) {
    memset(resHandle.res, 0, bytes);
  }
  return resHandle;
}

CGoCallResHandle HostFree(void *p) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaFreeHost(p);
  resHandle.pStrErr = checkCUDAError("HostFree");
  return resHandle;
}

CGoCallResHandle CreateCudaStream(int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStream_t s = NULL;
  cudaStreamCreate(&s);
  resHandle.res = reinterpret_cast<void *>(s);
  resHandle.pStrErr = checkCUDAError("CreateCudaStream");
  return resHandle;
}

CGoCallResHandle WaitForCudaStream(void *s, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStreamSynchronize((cudaStream_t) s);
  resHandle.pStrErr = checkCUDAError("WaitForCudaStream");
  return resHandle;
}

CGoCallResHandle DestroyCudaStream(void *s, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStreamDestroy((cudaStream_t) s);
  resHandle.pStrErr = checkCUDAError("DestroyCudaStream");
  return resHandle;
}

CGoCallResHandle AsyncCopyHostToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyHostToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyHostToDevice");
  return resHandle;
}

CGoCallResHandle AsyncCopyDeviceToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyDeviceToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyDeviceToDevice");
  return resHandle;
}

CGoCallResHandle AsyncCopyDeviceToHost(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyDeviceToHost, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyDeviceToHost");
  return resHandle;
}

CGoCallResHandle GetDeviceCount() {
  CGoCallResHandle resHandle = {NULL, NULL};
  // Store the device count directly in the pointer-sized res field
  // (zero-initialized above).
  cudaGetDeviceCount(reinterpret_cast<int *>(&resHandle.res));
  resHandle.pStrErr = checkCUDAError("GetDeviceCount");
  return resHandle;
}

CGoCallResHandle GetDeviceGlobalMemoryInMB(int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  resHandle.res = reinterpret_cast<void *>(prop.totalGlobalMem / (1024 * 1024));
  resHandle.pStrErr = checkCUDAError("GetDeviceGlobalMemoryInMB");
  return resHandle;
}

CGoCallResHandle CudaProfilerStart() {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaProfilerStart();
  resHandle.pStrErr = checkCUDAError("cudaProfilerStart");
  return resHandle;
}

CGoCallResHandle CudaProfilerStop() {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaDeviceSynchronize();
  cudaProfilerStop();
  resHandle.pStrErr = checkCUDAError("cudaProfilerStop");
  return resHandle;
}

// GetDeviceMemoryInfo reports the free and total sizes of the RMM pool on
// the given device.
CGoCallResHandle GetDeviceMemoryInfo(size_t *freeSize, size_t *totalSize,
                                     int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  resHandle.pStrErr = checkRMMError(rmmGetInfo(freeSize, totalSize, 0),
                                    "GetDeviceMemoryInfo");
  return resHandle;
}
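#ifdef RMM_ALLOC_STANDALONE_TEST
// Illustration only, not part of the original aresdb file: a sketch of how a
// caller might poll pool occupancy per device through GetDeviceMemoryInfo.
// RMM_ALLOC_STANDALONE_TEST is a hypothetical flag used together with the
// usage sketch at the bottom of this file.
static void logPoolUsage(int deviceCount) {
  for (int device = 0; device < deviceCount; device++) {
    size_t freeSize = 0, totalSize = 0;
    CGoCallResHandle h = GetDeviceMemoryInfo(&freeSize, &totalSize, device);
    if (h.pStrErr != nullptr) {
      fprintf(stderr, "%s", h.pStrErr);
      free(h.pStrErr);  // Error buffers are owned by the caller.
      continue;
    }
    printf("device %d: %zu of %zu bytes free in RMM pool\n",
           device, freeSize, totalSize);
  }
}
#endif  // RMM_ALLOC_STANDALONE_TEST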
CGoCallResHandle deviceMalloc(void **devPtr, size_t size) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_ALLOC(devPtr, size, 0),
                                    "deviceMalloc");
  return resHandle;
}

CGoCallResHandle deviceFree(void *devPtr) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_FREE(devPtr, 0), "deviceFree");
  return resHandle;
}

CGoCallResHandle deviceMemset(void *devPtr, int value, size_t count) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaMemset(devPtr, value, count);
  resHandle.pStrErr = checkCUDAError("deviceMemset");
  return resHandle;
}

CGoCallResHandle asyncCopyHostToDevice(void *dst, const void *src,
                                       size_t count, void *stream) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaMemcpyAsync(dst, src, count,
                  cudaMemcpyHostToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("asyncCopyHostToDevice");
  return resHandle;
}
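#ifdef RMM_ALLOC_STANDALONE_TEST
// Minimal usage sketch, not part of the original aresdb build: exercises the
// init / allocate / copy / synchronize / free round trip through the
// CGo-facing API above. Build with something like
//   nvcc -DRMM_ALLOC_STANDALONE_TEST rmm_alloc.cu -lrmm
// (the exact RMM link flags depend on the local install).

// dieOnError is a hypothetical helper for this sketch only: it aborts on the
// first failed call and releases the heap-allocated error message, as the
// contract of checkCUDAError/checkRMMError requires.
static void dieOnError(CGoCallResHandle h, const char *what) {
  if (h.pStrErr != nullptr) {
    fprintf(stderr, "%s: %s", what, h.pStrErr);
    free(h.pStrErr);
    exit(1);
  }
}

int main() {
  dieOnError(Init(), "Init");

  CGoCallResHandle count = GetDeviceCount();
  dieOnError(count, "GetDeviceCount");
  logPoolUsage(static_cast<int>(reinterpret_cast<size_t>(count.res)));

  // Allocate 1 MiB of zeroed pool memory on device 0, stage a small host
  // buffer into it on the default stream, then synchronize and free.
  CGoCallResHandle alloc = DeviceAllocate(1 << 20, 0);
  dieOnError(alloc, "DeviceAllocate");

  char host[16] = "hello, rmm";
  dieOnError(AsyncCopyHostToDevice(alloc.res, host, sizeof(host), NULL, 0),
             "AsyncCopyHostToDevice");
  dieOnError(WaitForCudaStream(NULL, 0), "WaitForCudaStream");

  dieOnError(DeviceFree(alloc.res, 0), "DeviceFree");
  printf("ok\n");
  return 0;
}
#endif  // RMM_ALLOC_STANDALONE_TEST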