github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/device_allocator.go (about)

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package query
    16  
    17  import (
    18  	"github.com/uber/aresdb/memutils"
    19  	"github.com/uber/aresdb/utils"
    20  	"strconv"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  	"unsafe"
    25  )
    26  
    27  //// DeviceAllocator virtually allocates devices to queries.
    28  //// It maintains device config and current query usage, allocates one or two
    29  //// devices to a query upon request. All requests to the GPU devices must go
    30  //// through this allocator in order for the virtual allocation to be effective.
    31  //// Two devices are allocated at the same time for a single query when consumer
    32  //// grade GPUs without ECC memory are used (to cross check for errors manually).
    33  //type DeviceAllocator interface {
    34  //	// Allocate a device (or two) for a query.
    35  //	// Returns the IDs of the allocated devices, or -1 in case of error.
    36  //	// Returns the same device ID if only one device is allocated.
    37  //	// Also returns the queryHandle for future references.
    38  //	// Memory requirement is guaranteed to be satisfied, thread requirement is
    39  //	// treated only as a hint. When all devices are temporarily busy, will block
    40  //	// until the request can be satisfied.
    41  //	DeviceAlloc(bytes, threads int) (deviceID0, deviceID1, queryHandle int)
    42  //	// Adjust the requirements of an existing query on the allocated devices.
    43  //	// This is used when a new batch of data (of different size) is about to be
    44  //	// transferred and processed for a query. The call always succeeds and returns
    45  //	// true when bytes decreases; however, when bytes increases, it may fail and
    46  //	// return false when failFast=true, or block wait for an extended amount of
    47  //	// time (it can still fail after the wait).
    48  //	// One protocol to handle increased bytes is:
    49  //	//   if DeviceRealloc(bytes=new, failFast=true) {
    50  //	//     return success
    51  //	//   }
    52  //	//   // Keep the query result only and free up most memory.
    53  //	//   DeviceRealloc(bytes=result_only)
    54  //	//   // DeviceAllocator will remember this query's intent for bigger memory.
    55  //	//   if DeviceRealloc(bytes=new, failFast=false) {
    56  //	//     return success
    57  //	//   }
    58  //	//   // fail the query due to unsatisfiable requirement.
    59  //	//   // Free up all space.
    60  //	//   // deviceFree()
    61  //	//   return failure
    62  //	DeviceRealloc(queryHandle, bytes, threads int, failFast bool) bool
    63  //	// Free up the resources allocated on the devices for the specified query.
    64  //	deviceFree(queryHandle int)
    65  //}
    66  var (
    67  	nullDevicePointer       = devicePointer{}
    68  	memoryReportingInterval = time.Second * 10
    69  )
    70  
    71  // devicePointer is the wrapper of actual device memory pointer plus the size it points to and which device
    72  // it belongs to.
    73  type devicePointer struct {
    74  	bytes, device int
    75  	pointer       unsafe.Pointer
    76  	// whether this pointer points to beginning of an allocated address.
    77  	allocated bool
    78  }
    79  
    80  func (p devicePointer) getPointer() unsafe.Pointer {
    81  	return p.pointer
    82  }
    83  
    84  func (p devicePointer) isNull() bool {
    85  	return p.pointer == nil
    86  }
    87  
    88  // offset returns another pointer points to the address of current ptr + offset.
    89  func (p devicePointer) offset(offset int) devicePointer {
    90  	return devicePointer{
    91  		device:  p.device,
    92  		pointer: utils.MemAccess(p.getPointer(), offset),
    93  	}
    94  }
    95  
    96  // deviceAllocator is the interface to allocate and deallocate device memory for a specific device.
    97  // Note this allocator only tracks memory usage as golang side. Any memory allocation/deallocation at
    98  // cuda side (either thrust code or our own code) is not tracked. So it's preferred to allocate the memory
    99  // at golang side and pass on the pointer to cuda code.
   100  type deviceAllocator interface {
   101  	// deviceAllocate allocates the specified amount of memory on the device.
   102  	deviceAllocate(bytes, device int) devicePointer
   103  	// deviceFree frees the specified memory from the device.
   104  	deviceFree(dp devicePointer)
   105  	// getAllocatedMemory returns allocated memory for a specific device.
   106  	getAllocatedMemory(device int) int64
   107  }
   108  
   109  var da deviceAllocator
   110  var deviceAllocatorOnce sync.Once
   111  
   112  // getDeviceAllocator returns singleton deviceAllocator instance.
   113  func getDeviceAllocator() deviceAllocator {
   114  	deviceAllocatorOnce.Do(func() {
   115  		da = newDeviceAllocator()
   116  	})
   117  	return da
   118  }
   119  
   120  // deviceAllocate is the wrapper of deviceAllocate of deviceAllocator.
   121  func deviceAllocate(bytes, device int) devicePointer {
   122  	return getDeviceAllocator().deviceAllocate(bytes, device)
   123  }
   124  
   125  // deviceFreeAndSetNil frees the specified device pointer if it's not null and set the pointer it holds to null.
   126  func deviceFreeAndSetNil(dp *devicePointer) {
   127  	if dp != nil && !dp.isNull() && dp.allocated {
   128  		getDeviceAllocator().deviceFree(*dp)
   129  		*dp = nullDevicePointer
   130  	}
   131  }
   132  
   133  func reportAllocatedMemory(deviceCount int, da deviceAllocator) {
   134  	// getAllocatedMemory may panic, therefore we should recover here
   135  	defer func() {
   136  		if r := recover(); r != nil {
   137  			var err error
   138  			switch x := r.(type) {
   139  			case string:
   140  				err = utils.StackError(nil, x)
   141  			case error:
   142  				err = utils.StackError(x, "Panic happens when reporting allocated memory")
   143  			default:
   144  				err = utils.StackError(nil, "Panic happens when reporting allocated memory %v", x)
   145  			}
   146  			utils.GetLogger().With("err", err).Error("Failed to report allocated memory")
   147  		}
   148  	}()
   149  
   150  	for device := 0; device < deviceCount; device++ {
   151  		utils.GetRootReporter().GetChildGauge(map[string]string{
   152  			"device": strconv.Itoa(device),
   153  		}, utils.AllocatedDeviceMemory).Update(float64(da.getAllocatedMemory(device)))
   154  	}
   155  }
   156  
   157  // newDeviceAllocator returns a new device allocator instances.
   158  func newDeviceAllocator() deviceAllocator {
   159  	// init may panic and crash the service. This is expected.
   160  	memutils.Init()
   161  	var da deviceAllocator
   162  	deviceCount := memutils.GetDeviceCount()
   163  	if memutils.IsPooledMemory() {
   164  		utils.GetLogger().Info("Using pooled device memory manager")
   165  		da = &pooledDeviceAllocatorImpl{}
   166  	} else {
   167  		utils.GetLogger().Info("Using memory tracking device memory manager")
   168  		da = &memoryTrackingDeviceAllocatorImpl{
   169  			memoryUsage: make([]int64, deviceCount),
   170  		}
   171  	}
   172  
   173  	// Start memory usage reporting go routine.
   174  	// Report the allocated memory of each device per memoryReportingInterval.
   175  	timer := time.NewTimer(memoryReportingInterval)
   176  	go func() {
   177  		for {
   178  			select {
   179  			case <-timer.C:
   180  				reportAllocatedMemory(deviceCount, da)
   181  				// Since we already receive the event from channel,
   182  				// there is no need to stop it and we can directly reset the timer.
   183  				timer.Reset(memoryReportingInterval)
   184  			}
   185  		}
   186  	}()
   187  	return da
   188  }
   189  
   190  // memoryTrackingDeviceAllocatorImpl maintains the memory space for each device and reports the updated memory every time an
   191  // allocation/free request is issued.
   192  type memoryTrackingDeviceAllocatorImpl struct {
   193  	memoryUsage []int64
   194  }
   195  
   196  // deviceAllocate allocates the specified amount of memory on the device. **Slice bound is not checked!!**
   197  func (d *memoryTrackingDeviceAllocatorImpl) deviceAllocate(bytes, device int) devicePointer {
   198  	dp := devicePointer{
   199  		device:    device,
   200  		bytes:     bytes,
   201  		pointer:   memutils.DeviceAllocate(bytes, device),
   202  		allocated: true,
   203  	}
   204  	atomic.AddInt64(&d.memoryUsage[device], int64(bytes))
   205  	return dp
   206  }
   207  
   208  // deviceFree frees the specified memory from the device. **Slice bound is not checked!!**
   209  func (d *memoryTrackingDeviceAllocatorImpl) deviceFree(dp devicePointer) {
   210  	memutils.DeviceFree(dp.pointer, dp.device)
   211  	atomic.AddInt64(&d.memoryUsage[dp.device], int64(-dp.bytes))
   212  }
   213  
   214  // getAllocatedMemory returns memory allocated by this device allocator. Note this
   215  // might be different from the actual device allocated for this device. As thrust
   216  // memory allocation is not tracked here.
   217  func (d *memoryTrackingDeviceAllocatorImpl) getAllocatedMemory(device int) int64 {
   218  	return d.memoryUsage[device]
   219  }
   220  
   221  // pooledDeviceAllocatorImpl just delegates every call to underlying pooled memory manager.
   222  type pooledDeviceAllocatorImpl struct {
   223  }
   224  
   225  // deviceAllocate allocates the specified amount of memory on the device. **Slice bound is not checked!!**
   226  func (d *pooledDeviceAllocatorImpl) deviceAllocate(bytes, device int) devicePointer {
   227  	dp := devicePointer{
   228  		device:    device,
   229  		bytes:     bytes,
   230  		pointer:   memutils.DeviceAllocate(bytes, device),
   231  		allocated: true,
   232  	}
   233  	return dp
   234  }
   235  
   236  // deviceFree frees the specified memory from the device. **Slice bound is not checked!!**
   237  func (d *pooledDeviceAllocatorImpl) deviceFree(dp devicePointer) {
   238  	memutils.DeviceFree(dp.pointer, dp.device)
   239  }
   240  
   241  // getAllocatedMemory returns memory allocated for a specific device.
   242  func (d *pooledDeviceAllocatorImpl) getAllocatedMemory(device int) int64 {
   243  	free, total := memutils.GetDeviceMemoryInfo(device)
   244  	return int64(total - free)
   245  }