gorgonia.org/gorgonia@v0.9.17/cuda/bfc.go (about)

package cuda

import (
	"bytes"
	"fmt"

	"github.com/pkg/errors"
)

const (
	minAllocBits = 8
	minAllocSize = 1 << minAllocBits

	freeAllocTresh = 0.75
)

var nilBlock = memblock{}

// memblock is a tuple of address and the size of the block - think of it as a slice header, where the cap is the size.
type memblock struct {
	address uintptr
	size    int64

	next, prev *memblock
}

func newMemblock(addr uintptr, size int64) *memblock {
	return &memblock{
		address: addr,
		size:    size,
	}
}

func (a memblock) cap() uintptr { return a.address + uintptr(a.size) }

// overlaps checks if two memblocks overlap.
func (a *memblock) overlaps(b *memblock) bool {
	if a == b {
		return true
	}
	if a.address == b.address {
		return true // it doesn't matter how many elements there are in the memory. As long as they start at the same address, they overlap
	}

	capA := a.cap()
	capB := b.cap()

	switch {
	case a.address < b.address:
		if b.address < capA {
			return true
		}
	case a.address > b.address:
		if a.address < capB {
			return true
		}
	}
	return false
}

// split cuts a block in two: the receiver keeps the first size bytes, and the remainder is returned as a new block.
func (a *memblock) split(size int64) (b *memblock) {
	if size >= a.size {
		allocatorLogf("block %v, size %v", a, size)
		panic("IMPOSSIBLE")
	}
	newAddress := a.address + uintptr(size)
	newSize := a.size - size
	a.size = size
	b = newMemblock(newAddress, newSize)
	return b
}

// we say a memblock is less than another memblock when:
// a.address < b.address and they don't overlap
func (a *memblock) lt(b *memblock) bool {
	if a.address == b.address {
		return false
	}

	capA := a.cap()

	if a.address < b.address && capA < b.address {
		return true
	}

	// anything else is not strictly less than
	return false
}

func (a *memblock) String() string {
	return fmt.Sprintf("{0x%x %d}", a.address, a.size)
}

// freelist is simply implemented as a linked list of memblocks
type freelist struct {
	first, last *memblock
	l           int
}

func (l *freelist) Len() int { return l.l }

func (l *freelist) String() string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "FIRST: %v, LAST %v | [", l.first, l.last)
	for block := l.first; block != nil; block = block.next {
		fmt.Fprintf(&buf, "%v, ", block)
	}
	fmt.Fprintf(&buf, "]")
	return buf.String()
}
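
// As a quick illustration of how the ordered free list behaves (a sketch using
// this file's own types; the addresses and sizes are made up): inserts keep the
// list sorted by address, overlapping blocks are merged by insert below, and
// merely adjacent blocks are left for coalesce to merge later:
//
//	l := new(freelist)
//	l.insert(newMemblock(0x000, 256)) // [{0x0 256}]
//	l.insert(newMemblock(0x200, 256)) // [{0x0 256}, {0x200 256}]
//	l.insert(newMemblock(0x100, 256)) // [{0x0 256}, {0x100 256}, {0x200 256}]
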
// insert inserts a block in an ordered fashion. This helps with coalescing.
func (l *freelist) insert(block *memblock) {
	allocatorLogf("Inserting %v", block)
	if l.first == nil {
		l.first = block
		l.last = block

		l.l++
		return
	}
	if block.address >= l.last.address {
		allocatorLogf("greater than last")
		overlaps := block.overlaps(l.last)
		switch {
		case overlaps:
			blockCap := block.cap()
			lastCap := l.last.cap()
			if blockCap < lastCap {
				return
			}
			l.last.size += int64(blockCap - lastCap)
			return
		default:
			l.last.next = block
			block.prev = l.last
			block.next = nil
			l.last = block

			l.l++
			return
		}
	}

	if block.address < l.first.address {
		allocatorLogf("lt first")
		overlaps := block.overlaps(l.first)
		if overlaps {
			// coalesce block into the first free block: the merged block starts at
			// block.address and extends to whichever cap is larger.
			blockCap := block.cap()
			firstCap := l.first.cap()
			l.first.address = block.address
			if firstCap < blockCap {
				l.first.size = block.size
			} else {
				l.first.size = block.size + int64(firstCap-blockCap)
			}
			return
		}

		l.first.prev = block
		block.next = l.first
		l.first = block
		l.l++
		return
	}

	allocatorLogf("insert block")
insert:
	for b := l.first; b != nil; b = b.next {
		overlaps := b.overlaps(block)
		switch {
		case b.address < block.address && overlaps:
			// coalesce block into b: extend b's cap out to block's cap
			blockCap := block.cap()
			bcap := b.cap()
			if blockCap <= bcap {
				return // do nothing, since block is already in b
			}

			newSize := int64(blockCap - bcap)
			b.size += newSize
			return

		case b.address < block.address && !overlaps:
			if b.next == nil {
				allocatorLogf("Uh oh")
				allocatorLogf("b: %v", b)
				allocatorLogf("l %v", l)
			}
			if b.next.address >= block.cap() {
				bnext := b.next
				b.next = block
				block.next = bnext
				block.prev = b
				bnext.prev = block
				l.l++
				return
			}
		case b.address == block.address:
			if b.size > block.size {
				return
			}
			b.size = block.size
			return
		case b.address > block.address && overlaps:
			blockCap := block.cap()
			bcap := b.cap()
			if bcap <= blockCap {
				b.address = block.address
				b.size = block.size
				return
			}
			b.address = block.address
			b.size = block.size + int64(bcap-blockCap)
			return
		case b.address > block.address && !overlaps:
			// gone too far.
			break insert
		default:
			panic("WTF")
		}
	}
	panic("Unreachable")
}

func (l *freelist) remove(block *memblock) {
	allocatorLogf("remove %v from free list", block)
	if l.first == block {
		l.first = block.next
	} else {
		block.prev.next = block.next
	}

	if l.last == block {
		l.last = block.prev
	} else {
		block.next.prev = block.prev
	}

	// cleanup
	block.next = nil
	block.prev = nil
	l.l--
}

// splitOrRemove splits block so that it fits the aligned and requested sizes,
// re-inserting any remnants into the free list, then removes block from the list.
func (l *freelist) splitOrRemove(block *memblock, aligned, size int64) {
	if block.size > aligned {
		split := block.split(aligned)
		l.insert(split)
	}
	if block.size > size {
		remnant := block.split(size)
		l.insert(remnant)
	}
	l.remove(block)
}
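
// To see splitOrRemove with concrete numbers (a hypothetical sketch; the numbers
// assume a 128-byte block size, so a 300-byte request is aligned up to 384):
//
//	l := new(freelist)
//	l.insert(newMemblock(0, 1024))
//	blk := l.first                 // {0x0 1024}
//	l.splitOrRemove(blk, 384, 300) // aligned=384, requested=300
//	// blk is now {0x0 300} and is off the list;
//	// the free list holds the remnants {0x12c 84} and {0x180 640}
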
// bfc is an accounting structure for memory allocation,
// directly inspired by TensorFlow's Best-Fit with Coalescing (BFC) memory allocator, which is a type of buddy memory allocator.
//
// Why is this needed?
// This allocator is needed because it's been shown that:
//	1. Allocating and copying data from host to device takes up the largest share of time.
//	2. Allocating memory on CUDA is a blocking call, even on the BatchedContext. This causes extra cgo calls and is inefficient.
//	3. It's more efficient to allocate one large block of memory upfront and then manage it internally.
//
// Why does this allocator allocate aligned memory?
// For no reason other than performance. CUDA memory is aligned to 32-byte, 64-byte and 128-byte boundaries.
// While it would be significantly easier to manage memory without alignment, the additional bookkeeping is worth it for the performance gains.
//
// Why is the freelist just a linked list of blocks?
// Because I'm generally a not-great programmer, and couldn't get a splay tree or a skip list to work properly. Rotating trees hurt my brain.
// In fact I spent more than 2 weeks trying to get a splay tree or skip list to test properly. In the end I thought the saner choice
// would be to leave this for any future developers to pick up.
//
// How does it work?
// It's a bookkeeping system. Every time memory is requested, the allocator goes to the free list and grabs the blocks required. Any spare is then
// re-inserted into the free list. Spares are rarely reused directly - mainly because they aren't aligned to the block size.
// There is a map which tracks which addresses are in use (and how big each block is);
// free blocks are kept in an ordered free list, which is scanned to find a fit.
// There are two counters tracking the number of free and alloc calls. If the ratio of frees to allocs passes a certain threshold, the free blocks are coalesced.
//
// How is the bfc used?
// Every VM will have a bfc (or multiple, if there are multiple devices). At startup, an analysis of the inserted Program will be run,
// which determines how much memory the VM will need to request from the device. The VM then requests TWICE as much memory (just in case).
// Creation of new tensors will then call the alloc methods of the VM for memory (a short sketch of this flow follows below).
//
// Is this the Best Memory Book-Keeping System?
// Hell no. There are better ones, but I'm not too good at implementing them. Please feel free to upgrade this.
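//
// A minimal sketch of the intended call sequence (illustrative only; the device
// pointer is assumed to come from a cu.MemAlloc-style call made by the VM, and
// the sizes are made up):
//
//	b := newBFC(128)                  // blockSize/alignment of 128 bytes
//	b.reserve(uintptr(devPtr), 1<<20) // hand the reserved slab to the allocator
//	addr, err := b.alloc(512)         // addr is an absolute device address; check err
//	// ... use the memory ...
//	b.free(addr)
//	slab := b.release()               // get the slab back, e.g. to cudaFree it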
//
// More information:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/common_runtime/bfc_allocator.cc
//
// More information about memory allocation and implementing one:
// https://github.com/angrave/SystemProgramming/wiki/Memory,-Part-2%3A-Implementing-a-Memory-Allocator
// https://www.codeproject.com/Articles/14525/Heap-Manager-for-Allocating-Memory-from-a-Shared-M
type bfc struct {
	start        uintptr
	size         int64
	blockSize    int64
	reservedSize int64

	freelist *freelist
	used     map[uintptr]int64 // keeps track of the sizes of each block

	// statistics
	allocated int64
	allocs    int
	frees     int
}

func newBFC(alignment int64) *bfc {
	b := makeBFC(alignment)
	return &b
}

func makeBFC(alignment int64) bfc {
	return bfc{
		blockSize: alignment,
		freelist:  new(freelist),
		used:      make(map[uintptr]int64),
	}
}

func (b *bfc) reset() {
	b.allocated = 0
	b.allocs = 0
	b.frees = 0
}

// reserve hands a slab of device memory, starting at start and size bytes long, to the allocator to manage.
func (b *bfc) reserve(start uintptr, size int64) {
	allocatorLogf("RESERVE starts: 0x%x | size: %v", start, size)
	b.start = start
	b.size = size - (size % b.blockSize)
	b.reservedSize = size
	b.freelist.insert(newMemblock(0, size))
	allocatorLogf("Start: 0x%x | Size %v", b.start, b.size)
}

func (b *bfc) release() uintptr {
	retVal := b.start
	b.start = 0
	b.size = 0
	b.freelist = new(freelist)
	b.used = make(map[uintptr]int64)
	return retVal
}

func (b *bfc) alloc(size int64) (mem uintptr, err error) {
	allocatorLogf("BFC Allocating %v", size)
	allocatorLogf("before alloc: %v", b.freelist)
	defer allocatorLogf("after alloc: %v", b.freelist)
	enterLogScope()
	defer leaveLogScope()
	if size <= 0 {
		return 0, errors.Errorf("Cannot allocate memory with size 0 or less")
	}
	aligned := b.align(size)
	block := b.bestFit(aligned)
	allocatorLogf("Got a block %v", block)
	if block == nil {
		// first try to coalesce
		b.coalesce()
		if block = b.bestFit(aligned); block == nil {
			// then we're really OOM
			return 0, oomError{
				res:       b.size,
				allocated: b.allocated,
			}
		}
	}
	b.freelist.splitOrRemove(block, aligned, size)
	b.used[block.address] = size

	b.allocated += size
	b.allocs++

	return block.address + b.start, nil
}

func (b *bfc) free(address uintptr) {
	allocatorLogf("BFC Free 0x%x", address)
	enterLogScope()
	defer leaveLogScope()

	allocatorLogf("Before: %v", b.freelist)
	defer allocatorLogf("After: %v", b.freelist)

	a := address - b.start // get internal address
	allocatorLogf("Internal address 0x%x", a)
	size, ok := b.used[a]
	if !ok {
		allocatorLogf("a: 0x%x | 0x%x", a, address)
		allocatorLogf("a: %v | %v %v", a, address, b.start)
		return
		// panic("Cannot free")
	}
	block := newMemblock(a, size)
	b.freelist.insert(block)
	delete(b.used, a)

	b.allocated -= size
	b.frees++
	if float64(b.frees)/float64(b.allocs) >= freeAllocTresh {
		b.coalesce()
	}
}

// bestFit returns the first free block that is at least size bytes large (a first-fit scan over the ordered free list).
func (b *bfc) bestFit(size int64) (best *memblock) {
	for block := b.freelist.first; block != nil; block = block.next {
		if block.size >= size {
			return block
		}
	}
	return nil
}
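
// How alignment, the free/alloc ratio and coalescing interact, with made-up
// numbers for illustration (blockSize = 128, freeAllocTresh = 0.75):
//
//	b := makeBFC(128)
//	b.reserve(0x10000, 4096)
//	p, _ := b.alloc(300) // hands out {0x0 300}; remnants {0x12c 84} and {0x180 3712} stay free
//	b.free(p)            // frees/allocs = 1.0 >= 0.75, so coalesce runs and the
//	                     // free list collapses back to a single {0x0 4096} block
//
// coalesce is also tried from alloc whenever the first scan finds no block big enough.
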
// coalesce coalesces the freelist using these two rules:
//   - the block's address must be aligned to the block size
//   - if two blocks next to each other share a fencepost, then they will be merged
func (b *bfc) coalesce() {
	allocatorLogf("PreCOALESCE: %v", b.freelist)
	defer allocatorLogf("POSTCOALESCE: %v", b.freelist)
	for block := b.freelist.first; block != nil; block = block.next {
		if block.address%uintptr(b.blockSize) != 0 {
			continue
		}
	inner:
		for next := block.next; next != nil; next = block.next {
			switch {
			case block.cap() == next.address:
				block.size += next.size
				block.next = next.next
				if block.next != nil {
					block.next.prev = block
				}
				next.next = nil
				next.prev = nil // fully unlink next

				if next == b.freelist.last {
					b.freelist.last = block
				}

				b.freelist.l--
			case block.overlaps(next):
				// unhandled yet
				panic("Unhandled: overlapping coalescing")
			default:
				break inner
			}
		}
	}
}

// align rounds size up to the nearest multiple of the block size (e.g. with a 128-byte block size, 129 aligns to 256).
func (b *bfc) align(size int64) int64 {
	blocks := size % b.blockSize
	if blocks == 0 {
		return size
	}
	size -= blocks
	return size + b.blockSize
}