github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/resources.go (about) 1 // Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License. 2 3 package runner 4 5 // This file contains the implementation of a resource tracker for a local host on 6 // which CUDA, storage, and main motherboard resources can be found and tracked on 7 // behalf of an application 8 9 import ( 10 "strconv" 11 12 humanize "github.com/dustin/go-humanize" 13 "github.com/go-stack/stack" 14 "github.com/jjeffery/kv" // MIT License 15 ) 16 17 // DiskAllocated hold information about disk resources consumed on a specific device 18 type DiskAllocated struct { 19 device string 20 size uint64 21 } 22 23 // Allocated gathers together data for allocations of machine level resources 24 // into a single data structure that can be used to track resource allocations for 25 // tasks 26 // 27 type Allocated struct { 28 GPU GPUAllocations 29 CPU *CPUAllocated 30 Disk *DiskAllocated 31 } 32 33 func (alloc *Allocated) Logable() (logable []interface{}) { 34 logable = []interface{}{"allocated_CPU", alloc.CPU.cores, "allocated_cpu_mem", humanize.Bytes(alloc.CPU.mem), 35 "allocated_disk", humanize.Bytes(alloc.Disk.size)} 36 for i, aGPU := range alloc.GPU { 37 logable = append(logable, "allocated_GPU "+strconv.Itoa(i)+"_slots", aGPU.slots, "allocated_GPU"+strconv.Itoa(i)+"_mem", humanize.Bytes(aGPU.mem)) 38 } 39 return logable 40 } 41 42 // AllocRequest is used by clients to make requests for specific types of machine resources 43 // 44 type AllocRequest struct { 45 MaxCPU uint 46 MaxMem uint64 47 MaxGPU uint // GPUs are allocated using slots which approximate their throughput 48 GPUDivisibles []uint // The small quantity of slots that are permitted for allocation for when multiple cards must be used 49 MaxGPUMem uint64 50 MaxDisk uint64 51 } 52 53 func (rqst *AllocRequest) Logable() (logable []interface{}) { 54 return []interface{}{"request_CPU", rqst.MaxCPU, "request_GPU_mem", humanize.Bytes(rqst.MaxMem), 55 "request_GPU", rqst.MaxGPU, "request_GPU_mem", humanize.Bytes(rqst.MaxGPUMem), 56 "request_disk", humanize.Bytes(rqst.MaxDisk)} 57 } 58 59 // Resources is a receiver for resource related methods used to describe execution requirements 60 // 61 type Resources struct{} 62 63 // NewResources is used to get a receiver for dealing with the 64 // resources being tracked by the studioml runner 65 // 66 func NewResources(localDisk string) (rsc *Resources, err kv.Error) { 67 68 err = initDiskResource(localDisk) 69 70 return &Resources{}, err 71 } 72 73 // Alloc will go through all requested resources and allocate them using the resource APIs. 74 // 75 // If any single resource be not available then the ones done so far will be released. The use of a receiver 76 // pointer is to make sure that the caller invokes the NewResources to populate some of the allocators with the 77 // context they require to track consumption of some types of resources, such as selecting the disk from which 78 // allocations will be performed. 79 // 80 // The caller is responsible for calling the release method when the resources are no longer needed. 81 // 82 // The live parameter can be used to controller whether the allocation attempts will perform 83 // an allocation (true), or whether they will simply test (false) that the allocation would have been 84 // completed successfully. 85 // 86 func (*Resources) Alloc(rqst AllocRequest, live bool) (alloc *Allocated, err kv.Error) { 87 88 alloc = &Allocated{} 89 90 // Each of the resources being allocated contain code to lock the individual resources 91 // the deallocation handles the release on a per resource basis 92 93 // Allocate the GPU resources first, they are typically the least available 94 if alloc.GPU, err = AllocGPU(rqst.MaxGPU, rqst.MaxGPUMem, rqst.GPUDivisibles, live); err != nil { 95 return nil, err 96 } 97 98 // CPU resources next 99 if alloc.CPU, err = AllocCPU(rqst.MaxCPU, rqst.MaxMem, live); err != nil { 100 if live { 101 alloc.Release() 102 } 103 return nil, err 104 } 105 106 // Lastly, disk storage 107 if alloc.Disk, err = AllocDisk(rqst.MaxDisk, live); err != nil { 108 if live { 109 alloc.Release() 110 } 111 return nil, err 112 } 113 114 return alloc, nil 115 } 116 117 // Release returns any allocated resources to the sub system from which they were obtained 118 // 119 func (a *Allocated) Release() (errs []kv.Error) { 120 121 errs = []kv.Error{} 122 123 if a == nil { 124 return []kv.Error{kv.NewError("unexpected nil supplied for the release of resources").With("stack", stack.Trace().TrimRuntime())} 125 } 126 127 for _, gpuAlloc := range a.GPU { 128 if e := ReturnGPU(gpuAlloc); e != nil { 129 errs = append(errs, e) 130 } 131 } 132 133 if a.CPU != nil { 134 a.CPU.Release() 135 } 136 137 if a.Disk != nil { 138 if err := a.Disk.Release(); err != nil { 139 errs = append(errs, err) 140 } 141 } else { 142 errs = append(errs, kv.NewError("disk block missing").With("stack", stack.Trace().TrimRuntime())) 143 } 144 145 return errs 146 }