gorgonia.org/gorgonia@v0.9.17/cuda/bfc.go

     1  package cuda
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  
     7  	"github.com/pkg/errors"
     8  )
     9  
    10  const (
    11  	minAllocBits = 8
    12  	minAllocSize = 1 << minAllocBits
    13  
    14  	freeAllocTresh = 0.75
    15  )
    16  
    17  var nilBlock = memblock{}
    18  
     19  // memblock is a tuple of an address and the size of the block - think of it as a slice header, where the cap is the size
    20  type memblock struct {
    21  	address uintptr
    22  	size    int64
    23  
    24  	next, prev *memblock
    25  }
    26  
    27  func newMemblock(addr uintptr, size int64) *memblock {
    28  	return &memblock{
    29  		address: addr,
    30  		size:    size,
    31  	}
    32  }
    33  
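         // cap returns the address one past the end of the block (address + size) - analogous to a slice's cap.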
    34  func (a memblock) cap() uintptr { return a.address + uintptr(a.size) }
    35  
    36  // overlaps checks if two memblocks are overlapping.
    37  func (a *memblock) overlaps(b *memblock) bool {
    38  	if a == b {
    39  		return true
    40  	}
    41  	if a.address == b.address {
    42  		return true // it doesn't matter how many elements there are in the memory. As long as they start at the same address, they overlap
    43  	}
    44  
    45  	capA := a.cap()
    46  	capB := b.cap()
    47  
    48  	switch {
    49  	case a.address < b.address:
    50  		if b.address < capA {
    51  			return true
    52  		}
    53  	case a.address > b.address:
    54  		if a.address < capB {
    55  			return true
    56  		}
    57  	}
    58  	return false
    59  }
    60  
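         // split splits the block at the given offset: the receiver keeps the first size bytes, and a new
         // block covering the remainder is returned. It panics if size is not smaller than the block's size.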
    61  func (a *memblock) split(size int64) (b *memblock) {
    62  	if size >= a.size {
    63  		allocatorLogf("block %v, size %v", a, size)
    64  		panic("IMPOSSIBLE")
    65  	}
    66  	newAddress := a.address + uintptr(size)
    67  	newSize := a.size - size
    68  	a.size = size
    69  	b = newMemblock(newAddress, newSize)
    70  	return b
    71  }
    72  
     73  // we say a memblock is less than another memblock when:
     74  //		a.address < b.address and the two blocks do not overlap
    75  func (a *memblock) lt(b *memblock) bool {
    76  	if a.address == b.address {
    77  		return false
    78  	}
    79  
    80  	capA := a.cap()
    81  
    82  	if a.address < b.address && capA < b.address {
    83  		return true
    84  	}
    85  
     86  	// anything else is not strictly less than
    87  	return false
    88  }
    89  
    90  func (a *memblock) String() string {
    91  	return fmt.Sprintf("{0x%x %d}", a.address, a.size)
    92  }
    93  
     94  // freelist is simply implemented as a doubly linked list of memblocks, kept in address order
    95  type freelist struct {
    96  	first, last *memblock
    97  	l           int
    98  }
    99  
   100  func (l *freelist) Len() int { return l.l }
   101  
   102  func (l *freelist) String() string {
   103  	var buf bytes.Buffer
   104  	fmt.Fprintf(&buf, "FIRST: %v, LAST %v | [", l.first, l.last)
   105  	for block := l.first; block != nil; block = block.next {
   106  		fmt.Fprintf(&buf, "%v, ", block)
   107  	}
   108  	fmt.Fprintf(&buf, "]")
   109  	return buf.String()
   110  }
   111  
    112  // insert inserts a block into the free list in address order, coalescing it with any block it overlaps. Keeping the list ordered makes coalescing adjacent blocks easier.
   113  func (l *freelist) insert(block *memblock) {
   114  	allocatorLogf("Inserting %v", block)
   115  	if l.first == nil {
   116  		l.first = block
   117  		l.last = block
   118  
   119  		l.l++
   120  		return
   121  	}
   122  	if block.address >= l.last.address {
   123  		allocatorLogf("greater than last")
   124  		overlaps := block.overlaps(l.last)
   125  		switch {
   126  		case overlaps:
   127  			blockCap := block.cap()
   128  			lastCap := l.last.cap()
   129  			if blockCap < lastCap {
   130  				return
   131  			}
   132  			l.last.size += int64(blockCap - lastCap)
   133  			return
   134  		default:
   135  			l.last.next = block
   136  			block.prev = l.last
   137  			block.next = nil
   138  			l.last = block
   139  
   140  			l.l++
   141  			return
   142  		}
   143  	}
   144  
   145  	if block.address < l.first.address {
   146  		allocatorLogf("lt first")
   147  		overlaps := block.overlaps(l.first)
    148  		if overlaps {
    149  			blockCap := block.cap() // coalesce: grow the first block to cover both it and the new block
    150  			if firstCap := l.first.cap(); firstCap > blockCap {
    151  				blockCap = firstCap
    152  			}
    153  			l.first.size = int64(blockCap - block.address)
    154  			l.first.address = block.address
    155  			return
    156  		}
   157  
   158  		l.first.prev = block
   159  		block.next = l.first
   160  		l.first = block
   161  		l.l++
   162  		return
   163  	}
   164  
   165  	allocatorLogf("insert block")
   166  insert:
   167  	for b := l.first; b != nil; b = b.next {
   168  		overlaps := b.overlaps(block)
   169  		switch {
   170  		case b.address < block.address && overlaps:
   171  			// coalesce block into b
   172  			blockCap := block.cap()
   173  			bcap := b.cap()
   174  			if blockCap <= bcap {
   175  				return // do nothing, since block is already in b
   176  			}
   177  
    178  			extra := int64(blockCap - bcap) // block extends past b, so grow b by the difference
    179  			b.size += extra
   180  			return
   181  
   182  		case b.address < block.address && !overlaps:
   183  			if b.next == nil {
   184  				allocatorLogf("Uh oh")
   185  				allocatorLogf("b: %v", b)
   186  				allocatorLogf("l %v", l)
   187  			}
   188  			if b.next.address >= block.cap() {
   189  				bnext := b.next
   190  				b.next = block
   191  				block.next = bnext
   192  				block.prev = b
   193  				bnext.prev = block
   194  				l.l++
   195  				return
   196  
   197  			}
   198  		case b.address == block.address:
   199  			if b.size > block.size {
   200  				return
   201  			}
   202  			b.size = block.size
   203  			return
   204  		case b.address > block.address && overlaps:
   205  			blockCap := block.cap()
   206  			bcap := b.cap()
   207  			if bcap <= blockCap {
   208  				b.address = block.address
   209  				b.size = block.size
   210  				return
   211  			}
   212  			b.address = block.address
   213  			b.size = block.size + int64(bcap-blockCap)
   214  			return
    215  		case b.address > block.address && !overlaps:
    216  			// gone too far without finding an insertion point; fall through to the panic below
    217  			break insert
   218  		default:
   219  			panic("WTF")
   220  		}
   221  	}
   222  	panic("Unreachable")
   223  }
   224  
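         // remove unlinks block from the free list. It assumes that block is currently in the list.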
   225  func (l *freelist) remove(block *memblock) {
   226  	allocatorLogf("remove %v from free list", block)
   227  	if l.first == block {
   228  		l.first = block.next
   229  	} else {
   230  		block.prev.next = block.next
   231  	}
   232  
   233  	if l.last == block {
   234  		l.last = block.prev
   235  	} else {
   236  		block.next.prev = block.prev
   237  	}
   238  
   239  	// cleanup
   240  	block.next = nil
   241  	block.prev = nil
   242  	l.l--
   243  }
   244  
    245  // splitOrRemove trims block down to the requested size, reinserting any remnants (first past the aligned size, then past the requested size) into the free list, before removing block from the free list
   246  func (l *freelist) splitOrRemove(block *memblock, aligned, size int64) {
   247  	if block.size > aligned {
   248  		split := block.split(aligned)
   249  		l.insert(split)
   250  	}
   251  	if block.size > size {
   252  		remnant := block.split(size)
   253  		l.insert(remnant)
   254  	}
   255  	l.remove(block)
   256  }
   257  
    258  // bfc is an accounting structure for memory allocation,
    259  // directly inspired by TensorFlow's Best Fit with Coalescing (BFC) memory allocator, which is a type of buddy memory allocator.
    260  //
    261  // Why is this needed?
    262  // This allocator is needed because it's been shown that:
    263  //	1. allocating and copying data from host to device in fact takes up most of the time.
    264  //	2. allocating memory on CUDA is a blocking call, even on the BatchedContext. It requires extra cgo calls and is inefficient.
    265  //	3. it's more efficient to allocate a large block of memory upfront and then manage it internally.
    266  //
    267  // Why does this allocator allocate aligned memory?
    268  // For no reason other than performance. CUDA memory is aligned to 32-byte, 64-byte and 128-byte boundaries.
    269  // While it would be significantly easier to manage memory without alignment, some additional bookkeeping is worth it for the performance gains.
    270  //
    271  // Why is the freelist just a linked list of blocks?
    272  // Because I'm generally a not-great programmer, and couldn't get a splay tree or a skip list to work properly. Rotating trees hurt my brain.
    273  // In fact I spent more than 2 weeks getting a splay tree or skip list to test properly. In the end I thought the saner choice
    274  // would be to leave this for any future developer to pick up.
    275  //
    276  // How does it work?
    277  // It's a bookkeeping system. Every time memory is requested, the allocator goes to the free list and grabs the blocks required. Any spare is then
    278  // re-inserted into the free list. Spares are rarely reused - mainly because they aren't aligned to the block size.
    279  // There is a map which tracks which addresses are in use (and how big each block is);
    280  // there is a free list which tracks which addresses are free for use (and how big each block is);
    281  // the free list is kept in address order so that adjacent free blocks can be coalesced cheaply.
    282  // There are two counters tracking the number of free and alloc calls. If the ratio of frees to allocs passes a certain threshold, the free blocks are coalesced.
    283  //
    284  // How is the bfc used?
    285  // Every VM will have a bfc (or multiple, if there are multiple devices). At startup, an analysis of the inserted Program is run,
    286  // which determines how much memory the VM will need to request from the device. The VM then requests TWICE as much memory (just in case).
    287  // Creation of new Tensors will then call the alloc methods of the VM for memory.
    288  //
    289  // Is this the best memory bookkeeping system?
    290  // Hell no. There are better ones, but I'm not too good at implementing them. Please feel free to upgrade this.
    291  //
    292  // More information:
    293  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/common_runtime/bfc_allocator.cc
    294  //
    295  // More information about memory allocation and implementing one:
    296  // https://github.com/angrave/SystemProgramming/wiki/Memory,-Part-2%3A-Implementing-a-Memory-Allocator
    297  // https://www.codeproject.com/Articles/14525/Heap-Manager-for-Allocating-Memory-from-a-Shared-M
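         //
         // A rough sketch of how a caller drives the allocator (devPtr and the sizes here are made-up
         // values for illustration; in Gorgonia it is the VM that makes these calls):
         //
         //	b := newBFC(128)          // 128-byte alignment
         //	b.reserve(devPtr, 1<<20)  // manage a 1MB region previously allocated on the device
         //	addr, err := b.alloc(512) // addr is a device address inside the reserved region
         //	if err != nil {
         //		// handle the OOM error
         //	}
         //	b.free(addr) // return the block to the free list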
   298  type bfc struct {
   299  	start        uintptr
   300  	size         int64
   301  	blockSize    int64
   302  	reservedSize int64
   303  
   304  	freelist *freelist
   305  	used     map[uintptr]int64 // keeps track of the sizes of each block
   306  
   307  	// statistics
   308  	allocated int64
   309  	allocs    int
   310  	frees     int
   311  }
   312  
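         // newBFC returns a new bfc whose block size is the given alignment.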
   313  func newBFC(alignment int64) *bfc {
   314  	b := makeBFC(alignment)
   315  	return &b
   316  }
   317  
   318  func makeBFC(alignment int64) bfc {
   319  	return bfc{
   320  		blockSize: alignment,
   321  		freelist:  new(freelist),
   322  		used:      make(map[uintptr]int64),
   323  	}
   324  }
   325  
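         // reset clears the allocation statistics.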
   326  func (b *bfc) reset() {
   327  	b.allocated = 0
   328  	b.allocs = 0
   329  	b.frees = 0
   330  
   331  }
   332  
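         // reserve gives the allocator a contiguous region of device memory, starting at start and size bytes long, to manage.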
   333  func (b *bfc) reserve(start uintptr, size int64) {
   334  	allocatorLogf("RESERVE starts: 0x%x | size: %v", start, size)
   335  	b.start = start
   336  	b.size = size - (size % b.blockSize)
   337  	b.reservedSize = size
   338  	b.freelist.insert(newMemblock(0, size))
   339  	allocatorLogf("Start: 0x%x | Size %v", b.start, b.size)
   340  }
   341  
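         // release resets the allocator's internal state and returns the start address of the region it was managing.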
   342  func (b *bfc) release() uintptr {
   343  	retVal := b.start
   344  	b.start = 0
   345  	b.size = 0
   346  	b.freelist = new(freelist)
   347  	b.used = make(map[uintptr]int64)
   348  	return retVal
   349  }
   350  
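         // alloc returns a device address pointing to at least size bytes of memory. If no suitable free
         // block is found, the free list is coalesced and the search retried before giving up with an OOM error.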
   351  func (b *bfc) alloc(size int64) (mem uintptr, err error) {
   352  	allocatorLogf("BFC Allocating %v", size)
   353  	allocatorLogf("before alloc: %v", b.freelist)
   354  	defer allocatorLogf("after alloc: %v", b.freelist)
   355  	enterLogScope()
   356  	defer leaveLogScope()
   357  	if size <= 0 {
   358  		return 0, errors.Errorf("Cannot allocate memory with size 0 or less")
   359  	}
   360  	aligned := b.align(size)
   361  	block := b.bestFit(aligned)
   362  	allocatorLogf("Got a block %v", block)
   363  	if block == nil {
   364  		// first try to coalesce
   365  		b.coalesce()
   366  		if block = b.bestFit(aligned); block == nil {
   367  			// then we're really OOM
   368  			return 0, oomError{
   369  				res:       b.size,
   370  				allocated: b.allocated,
   371  			}
   372  		}
   373  
   374  	}
   375  	b.freelist.splitOrRemove(block, aligned, size)
   376  	b.used[block.address] = size
   377  
   378  	b.allocated += size
   379  	b.allocs++
   380  
   381  	return block.address + b.start, nil
   382  }
   383  
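         // free returns the block at address (an address previously handed out by alloc) to the free
         // list. If the ratio of frees to allocs crosses freeAllocTresh, the free list is coalesced.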
   384  func (b *bfc) free(address uintptr) {
   385  	allocatorLogf("BFC Free 0x%x", address)
   386  	enterLogScope()
   387  	defer leaveLogScope()
   388  
   389  	allocatorLogf("Before: %v", b.freelist)
   390  	defer allocatorLogf("After: %v", b.freelist)
   391  
   392  	a := address - b.start // get internal address
   393  	allocatorLogf("Internal address 0x%x", a)
   394  	size, ok := b.used[a]
   395  	if !ok {
   396  		allocatorLogf("a: 0x%x | 0x%x", a, address)
   397  		allocatorLogf("a: %v | %v %v", a, address, b.start)
   398  		return
   399  		// panic("Cannot free")
   400  
   401  	}
   402  	block := newMemblock(a, size)
   403  	b.freelist.insert(block)
   404  	delete(b.used, a)
   405  
   406  	b.allocated -= size
   407  	b.frees++
   408  	if float64(b.frees)/float64(b.allocs) >= freeAllocTresh {
   409  		b.coalesce()
   410  	}
   411  }
   412  
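         // bestFit returns the first block in the address-ordered free list that is at least size bytes
         // large (effectively a first-fit scan), or nil if no such block exists.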
   413  func (b *bfc) bestFit(size int64) (best *memblock) {
   414  	for block := b.freelist.first; block != nil; block = block.next {
   415  		if block.size >= size {
   416  			return block
   417  		}
   418  	}
   419  	return nil
   420  }
   421  
    422  // coalesce coalesces the freelist using these two rules:
    423  //		- a merge run only starts at a block whose address is aligned to the block size
    424  //		- if two neighbouring blocks share a fencepost (one ends exactly where the next begins), they are merged
   425  func (b *bfc) coalesce() {
   426  	allocatorLogf("PreCOALESCE: %v", b.freelist)
   427  	defer allocatorLogf("POSTCOALESCE: %v", b.freelist)
   428  	for block := b.freelist.first; block != nil; block = block.next {
   429  		if block.address%uintptr(b.blockSize) != 0 {
   430  			continue
   431  		}
   432  	inner:
   433  		for next := block.next; next != nil; next = block.next {
   434  			switch {
   435  			case block.cap() == next.address:
   436  				block.size += next.size
    437  				block.next = next.next
         				if block.next != nil {
         					block.next.prev = block // keep the new neighbour's back pointer consistent
         				}
    438  				next.next = nil
    439  				next.prev = nil // kill it
   440  
   441  				if next == b.freelist.last {
   442  					b.freelist.last = block
   443  				}
   444  
   445  				b.freelist.l--
   446  			case block.overlaps(next):
   447  				// unhandled yet
   448  				panic("Unhandled: overlapping coalesceing")
   449  			default:
   450  				break inner
   451  			}
   452  		}
   453  	}
   454  }
   455  
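         // align rounds size up to the nearest multiple of the block size. For example, with a
         // (hypothetical) blockSize of 128, align(300) returns 384 and align(256) returns 256.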
   456  func (b *bfc) align(size int64) int64 {
    457  	remainder := size % b.blockSize
    458  	if remainder == 0 {
    459  		return size
    460  	}
    461  	size -= remainder
    462  	return size + b.blockSize
   463  }