github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/resources.go

github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/resources.go (about)

     1  // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.
     2  
     3  package runner
     4  
     5  // This file contains the implementation of a resource tracker for a local host on
     6  // which CUDA, storage, and main motherboard resources can be found and tracked on
     7  // behalf of an application
     8  
     9  import (
    10  	"strconv"
    11  
    12  	humanize "github.com/dustin/go-humanize"
    13  	"github.com/go-stack/stack"
    14  	"github.com/jjeffery/kv" // MIT License
    15  )
    16  
// DiskAllocated holds information about disk resources consumed on a specific device.
type DiskAllocated struct {
	// device names the device the allocation was made against
	device string
	// size is the quantity of storage allocated, it is formatted with
	// humanize.Bytes when an allocation is logged
	size   uint64
}
    22  
// Allocated gathers together data for allocations of machine level resources
// into a single data structure that can be used to track resource allocations for
// tasks.
//
// The CPU and Disk fields are pointers and can be nil, the Release method tests
// both of them before making use of them.
//
type Allocated struct {
	// GPU is ranged over by Logable and Release, one entry per GPU allocation
	GPU  GPUAllocations
	CPU  *CPUAllocated
	Disk *DiskAllocated
}
    32  
    33  func (alloc *Allocated) Logable() (logable []interface{}) {
    34  	logable = []interface{}{"allocated_CPU", alloc.CPU.cores, "allocated_cpu_mem", humanize.Bytes(alloc.CPU.mem),
    35  		"allocated_disk", humanize.Bytes(alloc.Disk.size)}
    36  	for i, aGPU := range alloc.GPU {
    37  		logable = append(logable, "allocated_GPU "+strconv.Itoa(i)+"_slots", aGPU.slots, "allocated_GPU"+strconv.Itoa(i)+"_mem", humanize.Bytes(aGPU.mem))
    38  	}
    39  	return logable
    40  }
    41  
// AllocRequest is used by clients to make requests for specific types of machine resources.
//
// MaxCPU and MaxMem are handed to AllocCPU, MaxGPU, MaxGPUMem and GPUDivisibles are handed
// to AllocGPU, and MaxDisk is handed to AllocDisk when the request is processed by Alloc.
// MaxMem, MaxGPUMem and MaxDisk are formatted using humanize.Bytes when the request is logged.
//
type AllocRequest struct {
	MaxCPU        uint
	MaxMem        uint64
	MaxGPU        uint   // GPUs are allocated using slots which approximate their throughput
	GPUDivisibles []uint // The small quantity of slots that are permitted for allocation for when multiple cards must be used
	MaxGPUMem     uint64
	MaxDisk       uint64
}
    52  
    53  func (rqst *AllocRequest) Logable() (logable []interface{}) {
    54  	return []interface{}{"request_CPU", rqst.MaxCPU, "request_GPU_mem", humanize.Bytes(rqst.MaxMem),
    55  		"request_GPU", rqst.MaxGPU, "request_GPU_mem", humanize.Bytes(rqst.MaxGPUMem),
    56  		"request_disk", humanize.Bytes(rqst.MaxDisk)}
    57  }
    58  
// Resources is a receiver for resource related methods used to describe execution requirements.
// The type has no fields of its own, values of it are obtained using NewResources.
//
type Resources struct{}
    62  
    63  // NewResources is used to get a receiver for dealing with the
    64  // resources being tracked by the studioml runner
    65  //
    66  func NewResources(localDisk string) (rsc *Resources, err kv.Error) {
    67  
    68  	err = initDiskResource(localDisk)
    69  
    70  	return &Resources{}, err
    71  }
    72  
    73  // Alloc will go through all requested resources and allocate them using the resource APIs.
    74  //
    75  // If any single resource be not available then the ones done so far will be released.  The use of a receiver
    76  // pointer is to make sure that the caller invokes the NewResources to populate some of the allocators with the
    77  // context they require to track consumption of some types of resources, such as selecting the disk from which
    78  // allocations will be performed.
    79  //
    80  // The caller is responsible for calling the release method when the resources are no longer needed.
    81  //
    82  // The live parameter can be used to controller whether the allocation attempts will perform
    83  // an allocation (true), or whether they will simply test (false) that the allocation would have been
    84  // completed successfully.
    85  //
    86  func (*Resources) Alloc(rqst AllocRequest, live bool) (alloc *Allocated, err kv.Error) {
    87  
    88  	alloc = &Allocated{}
    89  
    90  	// Each of the resources being allocated contain code to lock the individual resources
    91  	// the deallocation handles the release on a per resource basis
    92  
    93  	// Allocate the GPU resources first, they are typically the least available
    94  	if alloc.GPU, err = AllocGPU(rqst.MaxGPU, rqst.MaxGPUMem, rqst.GPUDivisibles, live); err != nil {
    95  		return nil, err
    96  	}
    97  
    98  	// CPU resources next
    99  	if alloc.CPU, err = AllocCPU(rqst.MaxCPU, rqst.MaxMem, live); err != nil {
   100  		if live {
   101  			alloc.Release()
   102  		}
   103  		return nil, err
   104  	}
   105  
   106  	// Lastly, disk storage
   107  	if alloc.Disk, err = AllocDisk(rqst.MaxDisk, live); err != nil {
   108  		if live {
   109  			alloc.Release()
   110  		}
   111  		return nil, err
   112  	}
   113  
   114  	return alloc, nil
   115  }
   116  
   117  // Release returns any allocated resources to the sub system from which they were obtained
   118  //
   119  func (a *Allocated) Release() (errs []kv.Error) {
   120  
   121  	errs = []kv.Error{}
   122  
   123  	if a == nil {
   124  		return []kv.Error{kv.NewError("unexpected nil supplied for the release of resources").With("stack", stack.Trace().TrimRuntime())}
   125  	}
   126  
   127  	for _, gpuAlloc := range a.GPU {
   128  		if e := ReturnGPU(gpuAlloc); e != nil {
   129  			errs = append(errs, e)
   130  		}
   131  	}
   132  
   133  	if a.CPU != nil {
   134  		a.CPU.Release()
   135  	}
   136  
   137  	if a.Disk != nil {
   138  		if err := a.Disk.Release(); err != nil {
   139  			errs = append(errs, err)
   140  		}
   141  	} else {
   142  		errs = append(errs, kv.NewError("disk block missing").With("stack", stack.Trace().TrimRuntime()))
   143  	}
   144  
   145  	return errs
   146  }