github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/device_allocator.go

// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package query

import (
	"github.com/uber/aresdb/memutils"
	"github.com/uber/aresdb/utils"
	"strconv"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"
)

//// DeviceAllocator virtually allocates devices to queries.
//// It maintains device config and current query usage, allocates one or two
//// devices to a query upon request. All requests to the GPU devices must go
//// through this allocator in order for the virtual allocation to be effective.
//// Two devices are allocated at the same time for a single query when consumer
//// grade GPUs without ECC memory are used (to cross check for errors manually).
//type DeviceAllocator interface {
//	// Allocate a device (or two) for a query.
//	// Returns the IDs of the allocated devices, or -1 in case of error.
//	// Returns the same device ID if only one device is allocated.
//	// Also returns the queryHandle for future references.
//	// Memory requirement is guaranteed to be satisfied, thread requirement is
//	// treated only as a hint. When all devices are temporarily busy, will block
//	// until the request can be satisfied.
//	DeviceAlloc(bytes, threads int) (deviceID0, deviceID1, queryHandle int)
//	// Adjust the requirements of an existing query on the allocated devices.
//	// This is used when a new batch of data (of different size) is about to be
//	// transferred and processed for a query. The call always succeeds and returns
//	// true when bytes decreases; however, when bytes increases, it may fail and
//	// return false when failFast=true, or block wait for an extended amount of
//	// time (it can still fail after the wait).
//	// One protocol to handle increased bytes is:
//	//   if DeviceRealloc(bytes=new, failFast=true) {
//	//     return success
//	//   }
//	//   // Keep the query result only and free up most memory.
//	//   DeviceRealloc(bytes=result_only)
//	//   // DeviceAllocator will remember this query's intent for bigger memory.
//	//   if DeviceRealloc(bytes=new, failFast=false) {
//	//     return success
//	//   }
//	//   // fail the query due to unsatisfiable requirement.
//	//   // Free up all space.
//	//   // deviceFree()
//	//   return failure
//	DeviceRealloc(queryHandle, bytes, threads int, failFast bool) bool
//	// Free up the resources allocated on the devices for the specified query.
//	deviceFree(queryHandle int)
//}

var (
	nullDevicePointer       = devicePointer{}
	memoryReportingInterval = time.Second * 10
)

// devicePointer is the wrapper of the actual device memory pointer plus the size it points to and which device
// it belongs to.
type devicePointer struct {
	bytes, device int
	pointer       unsafe.Pointer
	// whether this pointer points to the beginning of an allocated address.
	allocated bool
}

func (p devicePointer) getPointer() unsafe.Pointer {
	return p.pointer
}

func (p devicePointer) isNull() bool {
	return p.pointer == nil
}

// offset returns another pointer that points to the address of the current pointer + offset.
func (p devicePointer) offset(offset int) devicePointer {
	return devicePointer{
		device:  p.device,
		pointer: utils.MemAccess(p.getPointer(), offset),
	}
}

// deviceAllocator is the interface to allocate and deallocate device memory for a specific device.
// Note this allocator only tracks memory usage on the Go side. Any memory allocation/deallocation on
// the CUDA side (either thrust code or our own code) is not tracked. So it's preferred to allocate memory
// on the Go side and pass the pointer on to CUDA code.
type deviceAllocator interface {
	// deviceAllocate allocates the specified amount of memory on the device.
	deviceAllocate(bytes, device int) devicePointer
	// deviceFree frees the specified memory from the device.
	deviceFree(dp devicePointer)
	// getAllocatedMemory returns allocated memory for a specific device.
	getAllocatedMemory(device int) int64
}

var da deviceAllocator
var deviceAllocatorOnce sync.Once

// getDeviceAllocator returns the singleton deviceAllocator instance.
func getDeviceAllocator() deviceAllocator {
	deviceAllocatorOnce.Do(func() {
		da = newDeviceAllocator()
	})
	return da
}

// deviceAllocate is a package-level wrapper around deviceAllocate of the singleton deviceAllocator.
func deviceAllocate(bytes, device int) devicePointer {
	return getDeviceAllocator().deviceAllocate(bytes, device)
}

// deviceFreeAndSetNil frees the specified device pointer if it's not null and sets the pointer it holds to null.
func deviceFreeAndSetNil(dp *devicePointer) {
	if dp != nil && !dp.isNull() && dp.allocated {
		getDeviceAllocator().deviceFree(*dp)
		*dp = nullDevicePointer
	}
}

func reportAllocatedMemory(deviceCount int, da deviceAllocator) {
	// getAllocatedMemory may panic, therefore we should recover here.
	defer func() {
		if r := recover(); r != nil {
			var err error
			switch x := r.(type) {
			case string:
				err = utils.StackError(nil, x)
			case error:
				err = utils.StackError(x, "Panic happens when reporting allocated memory")
			default:
				err = utils.StackError(nil, "Panic happens when reporting allocated memory %v", x)
			}
			utils.GetLogger().With("err", err).Error("Failed to report allocated memory")
		}
	}()

	for device := 0; device < deviceCount; device++ {
		utils.GetRootReporter().GetChildGauge(map[string]string{
			"device": strconv.Itoa(device),
		}, utils.AllocatedDeviceMemory).Update(float64(da.getAllocatedMemory(device)))
	}
}
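
// exampleScratchBufferUsage is an illustrative sketch, not part of the original
// allocator code: it shows the intended allocate/offset/free pattern built on the
// package-level helpers above. The buffer size and device ID are assumptions
// chosen purely for illustration.
func exampleScratchBufferUsage() {
	// Allocate a 1 MiB scratch buffer on device 0; the singleton deviceAllocator
	// tracks this allocation.
	buf := deviceAllocate(1<<20, 0)
	// Free the owning pointer and reset it to null once we are done with it.
	defer deviceFreeAndSetNil(&buf)

	// Derive a non-owning pointer into the buffer. Offset pointers are not marked
	// as allocated, so deviceFreeAndSetNil treats them as no-ops.
	sub := buf.offset(1 << 10)
	_ = sub.getPointer() // raw unsafe.Pointer to hand to CUDA-side code
}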

// newDeviceAllocator returns a new device allocator instance.
func newDeviceAllocator() deviceAllocator {
	// memutils.Init may panic and crash the service. This is expected.
	memutils.Init()
	var da deviceAllocator
	deviceCount := memutils.GetDeviceCount()
	if memutils.IsPooledMemory() {
		utils.GetLogger().Info("Using pooled device memory manager")
		da = &pooledDeviceAllocatorImpl{}
	} else {
		utils.GetLogger().Info("Using memory tracking device memory manager")
		da = &memoryTrackingDeviceAllocatorImpl{
			memoryUsage: make([]int64, deviceCount),
		}
	}

	// Start the memory usage reporting goroutine.
	// Report the allocated memory of each device every memoryReportingInterval.
	timer := time.NewTimer(memoryReportingInterval)
	go func() {
		for {
			select {
			case <-timer.C:
				reportAllocatedMemory(deviceCount, da)
				// Since we have already received the event from the channel,
				// there is no need to stop the timer and we can reset it directly.
				timer.Reset(memoryReportingInterval)
			}
		}
	}()
	return da
}

// memoryTrackingDeviceAllocatorImpl maintains the memory usage for each device and reports the updated
// usage every time an allocation/free request is issued.
type memoryTrackingDeviceAllocatorImpl struct {
	memoryUsage []int64
}

// deviceAllocate allocates the specified amount of memory on the device. **Slice bound is not checked!!**
func (d *memoryTrackingDeviceAllocatorImpl) deviceAllocate(bytes, device int) devicePointer {
	dp := devicePointer{
		device:    device,
		bytes:     bytes,
		pointer:   memutils.DeviceAllocate(bytes, device),
		allocated: true,
	}
	atomic.AddInt64(&d.memoryUsage[device], int64(bytes))
	return dp
}

// deviceFree frees the specified memory from the device. **Slice bound is not checked!!**
func (d *memoryTrackingDeviceAllocatorImpl) deviceFree(dp devicePointer) {
	memutils.DeviceFree(dp.pointer, dp.device)
	atomic.AddInt64(&d.memoryUsage[dp.device], int64(-dp.bytes))
}

// getAllocatedMemory returns the memory allocated by this device allocator. Note this
// might differ from the memory actually allocated on the device, since thrust
// memory allocation is not tracked here.
func (d *memoryTrackingDeviceAllocatorImpl) getAllocatedMemory(device int) int64 {
	return d.memoryUsage[device]
}

// pooledDeviceAllocatorImpl just delegates every call to the underlying pooled memory manager.
type pooledDeviceAllocatorImpl struct {
}

// deviceAllocate allocates the specified amount of memory on the device. **Slice bound is not checked!!**
func (d *pooledDeviceAllocatorImpl) deviceAllocate(bytes, device int) devicePointer {
	dp := devicePointer{
		device:    device,
		bytes:     bytes,
		pointer:   memutils.DeviceAllocate(bytes, device),
		allocated: true,
	}
	return dp
}

// deviceFree frees the specified memory from the device. **Slice bound is not checked!!**
func (d *pooledDeviceAllocatorImpl) deviceFree(dp devicePointer) {
	memutils.DeviceFree(dp.pointer, dp.device)
}

// getAllocatedMemory returns the memory allocated for a specific device.
func (d *pooledDeviceAllocatorImpl) getAllocatedMemory(device int) int64 {
	free, total := memutils.GetDeviceMemoryInfo(device)
	return int64(total - free)
}
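
// exampleTrackedAllocation is an illustrative sketch, not part of the original
// allocator code: it shows how memoryTrackingDeviceAllocatorImpl mirrors Go-side
// allocations in getAllocatedMemory. It assumes memutils.Init has already been
// called and that device 0 exists; the 4096-byte size is an arbitrary example.
func exampleTrackedAllocation() int64 {
	alloc := &memoryTrackingDeviceAllocatorImpl{
		memoryUsage: make([]int64, memutils.GetDeviceCount()),
	}

	dp := alloc.deviceAllocate(4096, 0) // adds 4096 to memoryUsage[0]
	used := alloc.getAllocatedMemory(0) // 4096: only Go-side allocations are counted
	alloc.deviceFree(dp)                // subtracts 4096 again

	// CUDA-side (e.g. thrust) allocations never go through this allocator and are
	// therefore invisible to getAllocatedMemory.
	return used
}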