github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/memutils/memory/rmm_alloc.cu

// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <rmm/rmm.h>
#include <rmm/rmm_api.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "../memory.h"

const int MAX_ERROR_LEN = 100;

// checkCUDAError checks the CUDA error state of the last runtime call and, on
// failure, returns a pointer to a heap-allocated error message buffer. The
// buffer must be released by the caller or an upper caller.
char *checkCUDAError(const char *message) {
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    snprintf(buffer, MAX_ERROR_LEN,
             "ERROR when calling CUDA functions from host: %s: %s\n",
             message, cudaGetErrorString(error));
    return buffer;
  }
  return NULL;
}

// checkRMMError checks the given RMM status and, on failure, returns a
// heap-allocated error message buffer that the caller must release.
char *checkRMMError(rmmError_t rmmError, const char *message) {
  if (rmmError != RMM_SUCCESS) {
    char *buffer = reinterpret_cast<char *>(malloc(MAX_ERROR_LEN));
    snprintf(buffer, MAX_ERROR_LEN,
             "ERROR when calling RMM functions: %s: %s\n",
             message, rmmGetErrorString(rmmError));
    return buffer;
  }
  return NULL;
}

// GetFlags reports that this implementation manages device memory and is
// backed by a pooled (RMM) allocator.
DeviceMemoryFlags GetFlags() {
  return DEVICE_MEMORY_IMPLEMENTATION_FLAG | POOLED_MEMORY_FLAG;
}

// Init initializes an RMM memory pool on every visible device.
CGoCallResHandle Init() {
  CGoCallResHandle resHandle = GetDeviceCount();
  if (resHandle.pStrErr != nullptr) {
    return resHandle;
  }

  size_t deviceCount = reinterpret_cast<size_t>(resHandle.res);
  for (size_t device = 0; device < deviceCount; device++) {
    cudaSetDevice(static_cast<int>(device));
    rmmOptions_t options = {
        PoolAllocation,
        0,     // Default to half of total memory.
        false  // Disable logging.
    };
    resHandle.pStrErr = checkRMMError(rmmInitialize(&options), "rmmInitialize");
    if (resHandle.pStrErr != nullptr) {
      return resHandle;
    }
  }
  return resHandle;
}

// DeviceAllocate allocates zeroed device memory from the RMM pool on the
// given device.
CGoCallResHandle DeviceAllocate(size_t bytes, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_ALLOC(&resHandle.res, bytes, 0),
                                    "DeviceAllocate");
  if (resHandle.pStrErr == nullptr) {
    cudaMemset(resHandle.res, 0, bytes);
    resHandle.pStrErr = checkCUDAError("DeviceAllocate");
  }
  return resHandle;
}

CGoCallResHandle DeviceFree(void *p, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_FREE(p, 0), "DeviceFree");
  return resHandle;
}
// The following implementations are the same as in cuda_malloc.cu. We might
// remove the cuda_malloc.cu file after RMM is proven to work in a production
// environment.

CGoCallResHandle HostAlloc(size_t bytes) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // cudaHostAllocPortable makes sure that the allocation is associated with
  // all devices, not just the current device.
  cudaHostAlloc(&resHandle.res, bytes, cudaHostAllocPortable);
  resHandle.pStrErr = checkCUDAError("HostAlloc");
  // Only zero the buffer if the allocation succeeded.
  if (resHandle.pStrErr == nullptr) {
    memset(resHandle.res, 0, bytes);
  }
  return resHandle;
}

CGoCallResHandle HostFree(void *p) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaFreeHost(p);
  resHandle.pStrErr = checkCUDAError("HostFree");
  return resHandle;
}

CGoCallResHandle CreateCudaStream(int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStream_t s = NULL;
  cudaStreamCreate(&s);
  resHandle.res = reinterpret_cast<void *>(s);
  resHandle.pStrErr = checkCUDAError("CreateCudaStream");
  return resHandle;
}

CGoCallResHandle WaitForCudaStream(void *s, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStreamSynchronize((cudaStream_t) s);
  resHandle.pStrErr = checkCUDAError("WaitForCudaStream");
  return resHandle;
}

CGoCallResHandle DestroyCudaStream(void *s, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaStreamDestroy((cudaStream_t) s);
  resHandle.pStrErr = checkCUDAError("DestroyCudaStream");
  return resHandle;
}

CGoCallResHandle AsyncCopyHostToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyHostToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyHostToDevice");
  return resHandle;
}

CGoCallResHandle AsyncCopyDeviceToDevice(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyDeviceToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyDeviceToDevice");
  return resHandle;
}

CGoCallResHandle AsyncCopyDeviceToHost(
    void *dst, void *src, size_t bytes, void *stream, int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  cudaMemcpyAsync(dst, src, bytes,
                  cudaMemcpyDeviceToHost, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("AsyncCopyDeviceToHost");
  return resHandle;
}

CGoCallResHandle GetDeviceCount() {
  CGoCallResHandle resHandle = {NULL, NULL};
  // Store the device count directly in the pointer-sized res field
  // (zero-initialized above).
  cudaGetDeviceCount(reinterpret_cast<int *>(&resHandle.res));
  resHandle.pStrErr = checkCUDAError("GetDeviceCount");
  return resHandle;
}

CGoCallResHandle GetDeviceGlobalMemoryInMB(int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  resHandle.res = reinterpret_cast<void *>(prop.totalGlobalMem / (1024 * 1024));
  resHandle.pStrErr = checkCUDAError("GetDeviceGlobalMemoryInMB");
  return resHandle;
}

CGoCallResHandle CudaProfilerStart() {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaProfilerStart();
  resHandle.pStrErr = checkCUDAError("cudaProfilerStart");
  return resHandle;
}

CGoCallResHandle CudaProfilerStop() {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaDeviceSynchronize();
  cudaProfilerStop();
  resHandle.pStrErr = checkCUDAError("cudaProfilerStop");
  return resHandle;
}

// GetDeviceMemoryInfo reports the free and total sizes of the RMM pool on
// the given device.
CGoCallResHandle GetDeviceMemoryInfo(size_t *freeSize, size_t *totalSize,
                                     int device) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaSetDevice(device);
  resHandle.pStrErr = checkRMMError(rmmGetInfo(freeSize, totalSize, 0),
                                    "GetDeviceMemoryInfo");
  return resHandle;
}
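#ifdef RMM_ALLOC_STANDALONE_TEST
// Illustration only, not part of the original aresdb file: a sketch of how a
// caller might poll pool occupancy per device through GetDeviceMemoryInfo.
// RMM_ALLOC_STANDALONE_TEST is a hypothetical flag used together with the
// usage sketch at the bottom of this file.
static void logPoolUsage(int deviceCount) {
  for (int device = 0; device < deviceCount; device++) {
    size_t freeSize = 0, totalSize = 0;
    CGoCallResHandle h = GetDeviceMemoryInfo(&freeSize, &totalSize, device);
    if (h.pStrErr != nullptr) {
      fprintf(stderr, "%s", h.pStrErr);
      free(h.pStrErr);  // Error buffers are owned by the caller.
      continue;
    }
    printf("device %d: %zu of %zu bytes free in RMM pool\n",
           device, freeSize, totalSize);
  }
}
#endif  // RMM_ALLOC_STANDALONE_TEST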
CGoCallResHandle deviceMalloc(void **devPtr, size_t size) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_ALLOC(devPtr, size, 0),
                                    "deviceMalloc");
  return resHandle;
}

CGoCallResHandle deviceFree(void *devPtr) {
  CGoCallResHandle resHandle = {NULL, NULL};
  // For now use the default stream to avoid changing the memory allocation
  // interface.
  // TODO(lucafuji): use the stream of the current execution pipeline for
  // allocation and free.
  resHandle.pStrErr = checkRMMError(RMM_FREE(devPtr, 0), "deviceFree");
  return resHandle;
}

CGoCallResHandle deviceMemset(void *devPtr, int value, size_t count) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaMemset(devPtr, value, count);
  resHandle.pStrErr = checkCUDAError("deviceMemset");
  return resHandle;
}

CGoCallResHandle asyncCopyHostToDevice(void *dst, const void *src,
                                       size_t count, void *stream) {
  CGoCallResHandle resHandle = {NULL, NULL};
  cudaMemcpyAsync(dst, src, count,
                  cudaMemcpyHostToDevice, (cudaStream_t) stream);
  resHandle.pStrErr = checkCUDAError("asyncCopyHostToDevice");
  return resHandle;
}
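#ifdef RMM_ALLOC_STANDALONE_TEST
// Minimal usage sketch, not part of the original aresdb build: exercises the
// init / allocate / copy / synchronize / free round trip through the
// CGo-facing API above. Build with something like
//   nvcc -DRMM_ALLOC_STANDALONE_TEST rmm_alloc.cu -lrmm
// (the exact RMM link flags depend on the local install).

// dieOnError is a hypothetical helper for this sketch only: it aborts on the
// first failed call and releases the heap-allocated error message, as the
// contract of checkCUDAError/checkRMMError requires.
static void dieOnError(CGoCallResHandle h, const char *what) {
  if (h.pStrErr != nullptr) {
    fprintf(stderr, "%s: %s", what, h.pStrErr);
    free(h.pStrErr);
    exit(1);
  }
}

int main() {
  dieOnError(Init(), "Init");

  CGoCallResHandle count = GetDeviceCount();
  dieOnError(count, "GetDeviceCount");
  logPoolUsage(static_cast<int>(reinterpret_cast<size_t>(count.res)));

  // Allocate 1 MiB of zeroed pool memory on device 0, stage a small host
  // buffer into it on the default stream, then synchronize and free.
  CGoCallResHandle alloc = DeviceAllocate(1 << 20, 0);
  dieOnError(alloc, "DeviceAllocate");

  char host[16] = "hello, rmm";
  dieOnError(AsyncCopyHostToDevice(alloc.res, host, sizeof(host), NULL, 0),
             "AsyncCopyHostToDevice");
  dieOnError(WaitForCudaStream(NULL, 0), "WaitForCudaStream");

  dieOnError(DeviceFree(alloc.res, 0), "DeviceFree");
  printf("ok\n");
  return 0;
}
#endif  // RMM_ALLOC_STANDALONE_TEST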