github.com/sentienttechnologies/studio-go-runner@v0.0.0-20201118202441-6d21f2ced8ee/internal/runner/cuda.go

// Copyright 2018-2020 (c) Cognizant Digital Business, Evolutionary AI. All rights reserved. Issued under the Apache 2.0 License.

package runner

// This file contains the data structures used by the CUDA package that are used
// for when the platform is and is not supported

import (
	"context"
	"fmt"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/davecgh/go-spew/spew"
	"github.com/go-stack/stack"
	"github.com/jjeffery/kv" // MIT License
	"github.com/rs/xid"

	"github.com/lthibault/jitterbug"

	"github.com/mitchellh/copystructure"
)

type device struct {
	UUID       string    `json:"uuid"`
	Name       string    `json:"name"`
	Temp       uint      `json:"temp"`
	Powr       uint      `json:"powr"`
	MemTot     uint64    `json:"memtot"`
	MemUsed    uint64    `json:"memused"`
	MemFree    uint64    `json:"memfree"`
	EccFailure *kv.Error `json:"eccfailure"`
}

type cudaDevices struct {
	Devices []device `json:"devices"`
}

// GPUTrack is used to track usage of GPU cards and any errors generated by the cards
// at the hardware level
//
type GPUTrack struct {
	UUID       string              // The UUID designation for the GPU being managed
	Slots      uint                // The number of logical slots the GPU has, based on its size
	Mem        uint64              // The amount of memory the GPU possesses
	FreeSlots  uint                // The number of free logical slots the GPU has available
	FreeMem    uint64              // The amount of free memory the GPU has
	EccFailure *kv.Error           // Any ECC failure related error messages, nil if none encountered
	Tracking   map[string]struct{} // Used to validate allocations as they are released
}

type gpuTracker struct {
	Allocs map[string]*GPUTrack
	sync.Mutex
}

var (
	// A map keyed on the nvidia device UUID containing information about cards and
	// their occupancy by the go runner.
	//
	gpuAllocs gpuTracker

	// UseGPU is used for specific types of testing to disable GPU tests when there
	// are GPU cards potentially present but they need to be disabled, this flag
	// is not used during production to change behavior in any way
	UseGPU *bool

	// CudaInitErr records the result of the CUDA library initialization that would
	// impact ongoing operation
	CudaInitErr *kv.Error

	// CudaInitWarnings records warnings and errors that are deemed not to be fatal
	// to the ongoing CUDA library usage but are of importance
	CudaInitWarnings = []kv.Error{}

	// CudaInTest is used to check if the running process is a go test process, if so then
	// this will disable certain types of checking when using very limited GPU
	// hardware
	CudaInTest = false
)

func init() {
	temp := true
	UseGPU = &temp

	gpuDevices, err := getCUDAInfo()
	if err != nil {
		CudaInitErr = &err
		CudaInitWarnings = append(CudaInitWarnings, err)
		return
	}

	devs := os.Getenv("CUDA_VISIBLE_DEVICES")
	if len(devs) == 0 {
		devs = os.Getenv("NVIDIA_VISIBLE_DEVICES")
	}

	visDevices := strings.Split(devs, ",")

	if devs == "all" {
		visDevices = make([]string, 0, len(gpuDevices.Devices))
		for _, device := range gpuDevices.Devices {
			visDevices = append(visDevices, device.UUID)
		}
	}

	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()
	gpuAllocs.Allocs = make(map[string]*GPUTrack, len(visDevices))

	// If the visDevices were specified use them to generate existing entries inside the device map.
	// These entries will then get filled in later.
	//
	// Look to see if we have any index values in here, it really should be all UUID strings.
	// Warn if we find some, but still continue.
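	//
	// As an illustration only (these values are hypothetical, not taken from the
	// repository), the visibility variable could look like either of:
	//
	//   CUDA_VISIBLE_DEVICES=GPU-6f3e2a91-4d2c-1b7e-8c3d-90ab12cd34ef
	//   CUDA_VISIBLE_DEVICES=0,1
	//
	// The UUID form is matched directly against the device inventory further down,
	// while the index form is translated to a UUID below and produces a warning.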
	warned := false
	for _, id := range visDevices {
		if len(id) == 0 {
			continue
		}
		if i, err := strconv.Atoi(id); err == nil {
			if !warned {
				warned = true
				CudaInitWarnings = append(CudaInitWarnings, kv.NewError("CUDA_VISIBLE_DEVICES should be using UUIDs not indexes").With("stack", stack.Trace().TrimRuntime()))
			}
			if i >= len(gpuDevices.Devices) {
				// The index is past the known population of GPU cards so it cannot be used
				CudaInitWarnings = append(CudaInitWarnings, kv.NewError("CUDA_VISIBLE_DEVICES contained an index past the known population of GPU cards").With("stack", stack.Trace().TrimRuntime()))
				continue
			}
			gpuAllocs.Allocs[gpuDevices.Devices[i].UUID] = &GPUTrack{Tracking: map[string]struct{}{}}
		} else {
			gpuAllocs.Allocs[id] = &GPUTrack{Tracking: map[string]struct{}{}}
		}
	}

	if len(gpuAllocs.Allocs) == 0 {
		for _, dev := range gpuDevices.Devices {
			gpuAllocs.Allocs[dev.UUID] = &GPUTrack{Tracking: map[string]struct{}{}}
		}
	}

	// Scan the inventory, checking matches if they were specified in the visibility env var and then fill
	// in real world data
	//
	for _, dev := range gpuDevices.Devices {
		// Don't include devices that were not specified by CUDA_VISIBLE_DEVICES
		if _, isPresent := gpuAllocs.Allocs[dev.UUID]; !isPresent {
			fmt.Println("GPU Skipped", dev.UUID)
			continue
		}

		track := &GPUTrack{
			UUID:       dev.UUID,
			Mem:        dev.MemFree,
			EccFailure: dev.EccFailure,
			Tracking:   map[string]struct{}{},
		}
		switch {
		case strings.Contains(dev.Name, "GTX 1050"),
			strings.Contains(dev.Name, "GTX 1060"):
			track.Slots = 2
		case strings.Contains(dev.Name, "GTX 1070"),
			strings.Contains(dev.Name, "GTX 1080"):
			track.Slots = 2
		case strings.Contains(dev.Name, "TITAN X"):
			track.Slots = 2
		case strings.Contains(dev.Name, "RTX 2080 Ti"):
			track.Slots = 2
		case strings.Contains(dev.Name, "Tesla K80"):
			track.Slots = 2
		case strings.Contains(dev.Name, "Tesla P40"):
			track.Slots = 4
		case strings.Contains(dev.Name, "Tesla P100"):
			track.Slots = 8
		case strings.Contains(dev.Name, "Tesla V100"):
			track.Slots = 16
		default:
			CudaInitWarnings = append(CudaInitWarnings, kv.NewError("unrecognized gpu device").With("gpu_name", dev.Name).With("gpu_uuid", dev.UUID).With("stack", stack.Trace().TrimRuntime()))
		}
		track.FreeSlots = track.Slots
		track.FreeMem = track.Mem
		gpuAllocs.Allocs[dev.UUID] = track
	}
}

// GetCUDAInfo returns a snapshot of the CUDA capable devices currently visible
// to the runner along with their hardware details
func GetCUDAInfo() (outDevs cudaDevices, err kv.Error) {
	return getCUDAInfo()
}

// GPUInventory can be used to extract a copy of the current state of the GPU hardware seen within the
// runner
func GPUInventory() (gpus []GPUTrack, err kv.Error) {

	gpus = []GPUTrack{}

	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	for _, alloc := range gpuAllocs.Allocs {
		cpy, errGo := copystructure.Copy(*alloc)
		if errGo != nil {
			return nil, kv.Wrap(errGo).With("stack", stack.Trace().TrimRuntime())
		}
		gpus = append(gpus, cpy.(GPUTrack))
	}
	return gpus, nil
}

// MonitorGPUs will, having initialized all of the devices in the tracking map,
// check the devices for ECC and other failures when started as a go routine,
// marking failed GPUs in the tracking table
//
func MonitorGPUs(ctx context.Context, statusC chan<- []string, errC chan<- kv.Error) {
	// Take all of the warnings etc that were gathered during initialization and
	// get them back to the error handling listener
	for _, warn := range CudaInitWarnings {
		select {
		case errC <- warn:
		case <-time.After(time.Second):
			// last gasp attempt to output the error
			fmt.Println(warn)
		}
	}

	firstTime := true

	t := jitterbug.New(time.Second*30, &jitterbug.Norm{Stdev: time.Second * 3})
	defer t.Stop()

	for {
		select {
		case <-t.C:
			gpuDevices, err := getCUDAInfo()
			if err != nil {
				select {
				case errC <- err:
				default:
					// last gasp attempt to output the error
					fmt.Println(err)
				}
			}
			// Look at all the GPUs we have in our hardware config
			for _, dev := range gpuDevices.Devices {
				if firstTime {
					msg := []string{"gpu found", "name", dev.Name, "uuid", dev.UUID, "stack", stack.Trace().TrimRuntime().String()}
					select {
					case statusC <- msg:
					case <-time.After(time.Second):
						fmt.Println(msg)
					}
				}
				if dev.EccFailure != nil {
					gpuAllocs.Lock()
					// Check to see if the hardware GPU had a failure
					// and if it is in the tracking table and does
					// not yet have an error logged log the error
					// in the tracking table
					if gpu, isPresent := gpuAllocs.Allocs[dev.UUID]; isPresent {
						if gpu.EccFailure == nil {
							gpu.EccFailure = dev.EccFailure
							gpuAllocs.Allocs[gpu.UUID] = gpu
						}
					}
					gpuAllocs.Unlock()
					select {
					case errC <- *dev.EccFailure:
					default:
						// last gasp attempt to output the error
						fmt.Println(dev.EccFailure)
					}
				}
			}
			firstTime = false
		case <-ctx.Done():
			return
		}
	}
}

// GPUCount returns the number of allocatable GPU resources
func GPUCount() (cnt int) {
	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	return len(gpuAllocs.Allocs)
}

// GPUSlots gets the free and total number of GPU capacity slots within
// the machine
//
func GPUSlots() (cnt uint, freeCnt uint) {
	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	for _, alloc := range gpuAllocs.Allocs {
		cnt += alloc.Slots
		freeCnt += alloc.FreeSlots
	}
	return cnt, freeCnt
}

// LargestFreeGPUSlots gets the largest number of free GPU slots available
// on any single device
//
func LargestFreeGPUSlots() (cnt uint) {
	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	for _, alloc := range gpuAllocs.Allocs {
		if alloc.FreeSlots > cnt {
			cnt = alloc.FreeSlots
		}
	}
	return cnt
}

// TotalFreeGPUSlots gets the total number of free GPU slots summed across
// all of the devices
//
func TotalFreeGPUSlots() (cnt uint) {
	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	for _, alloc := range gpuAllocs.Allocs {
		cnt += alloc.FreeSlots
	}
	return cnt
}

// LargestFreeGPUMem will obtain the largest amount of free GPU memory available
// on any of the individual cards accessible to the runner
func LargestFreeGPUMem() (freeMem uint64) {
	gpuAllocs.Lock()
	defer gpuAllocs.Unlock()

	for _, alloc := range gpuAllocs.Allocs {
		if alloc.Slots != 0 && alloc.FreeMem > freeMem {
			freeMem = alloc.FreeMem
		}
	}
	return freeMem
}

// GPUAllocated is used to record the allocation/reservation of a GPU resource on behalf of a caller
//
type GPUAllocated struct {
	tracking string            // Allocation tracking ID
	uuid     string            // The device identifier this allocation was successful against
	slots    uint              // The number of GPU slots given from the allocation
	mem      uint64            // The amount of memory given to the allocation
	Env      map[string]string // Any environment variables the device allocator wants the runner to use
}

// GPUAllocations records the set of allocations that together were made on behalf of a caller.
//
type GPUAllocations []*GPUAllocated

// AllocGPU will select the default allocation pool for GPUs and call the allocation for it.
//
func AllocGPU(maxGPU uint, maxGPUMem uint64, unitsOfAllocation []uint, live bool) (alloc GPUAllocations, err kv.Error) {
	return gpuAllocs.AllocGPU(maxGPU, maxGPUMem, unitsOfAllocation, live)
}

// evens returns a series that begins with the supplied start value and then
// contains the even numbers up to and including the end value, for example
// evens(2, 9) returns [2 4 6 8]
func evens(start int, end int) (result []int) {
	result = []int{start}
	inc := 1
	for cur := start + 1; cur < end+1; cur += inc {
		if cur%2 == 0 {
			result = append(result, cur)
			inc = 2
		}
	}
	return result
}

// AllocGPU will attempt to find a free CUDA capable GPU from a supplied allocator pool
// and assign it to the client. It will on finding a device set the appropriate values
// in the allocated return structure that the client can use to manage their resource
// consumption to match the permitted limits.
//
// When allocations occur across multiple devices the units of allocation parameter
// defines the granularity, in slots, that the cards must conform to.
//
// Any allocation will take an entire card, we do not break cards across experiments.
//
// This receiver uses a user supplied pool which allows for unit tests to be written that use a
// custom pool.
//
// The live parameter if false can be used to test whether the allocation would be successful
// without performing it. If live is false no allocation will be returned and err will be nil
// if the allocation would have been successful.
//
func (allocator *gpuTracker) AllocGPU(maxGPU uint, maxGPUMem uint64, unitsOfAllocation []uint, live bool) (alloc GPUAllocations, err kv.Error) {

	alloc = GPUAllocations{}

	if maxGPU == 0 && maxGPUMem == 0 {
		return alloc, nil
	}

	// Start with the smallest granularity of allocations permitted and try and find a fit for the total,
	// then continue up through the granularities until we have exhausted the options

	// Put the units of allocation into a searchable slice, sorted below with the smallest first
	units := make([]int, len(unitsOfAllocation))
	for i, unit := range unitsOfAllocation {
		units[i] = int(unit)
	}
	// If the caller failed to supply units of allocation generate a default set,
	// starting at the minimum number of slots for a GPU, being 2, and covering
	// the even numbers up to the upper limit
	if len(units) == 0 {
		units = evens(2, int(maxGPU+1)*2)
	}

	sort.Slice(units, func(i, j int) bool { return units[i] < units[j] })

	// Start building logging style information to be used in the
	// event of a real error
	kvDetails := []interface{}{"maxGPU", maxGPU, "units", units}

	// Now we lock after doing initialization of the functions own variables
	allocator.Lock()
	defer allocator.Unlock()

	// Add a structure that will be used later to order our UUIDs
	// by the number of free slots they have
	type SlotsByUUID struct {
		uuid      string
		freeSlots uint
	}
	slotsByUUID := make([]SlotsByUUID, 0, len(allocator.Allocs))

	// Take any cards that have the exact number of free slots that we have
	// in our permitted units and use those, but exclude cards with
	// ECC errors
	usableAllocs := make(map[string]*GPUTrack, len(allocator.Allocs))
	for k, v := range allocator.Allocs {
		// Cannot use this card, it is broken
		if v.EccFailure != nil {
			continue
		}
		// Make sure the card's slot count is one of the unit values
		// acceptable to the caller
		pos := sort.SearchInts(units, int(v.Slots))
		if pos < len(units) && int(v.Slots) == units[pos] {
			usableAllocs[k] = v
			slotsByUUID = append(slotsByUUID, SlotsByUUID{uuid: v.UUID, freeSlots: v.FreeSlots})
		}
	}

	if len(slotsByUUID) == 0 {
		kvDetails = append(kvDetails, []interface{}{"allocs", spew.Sdump(allocator.Allocs)}...)
		return nil, kv.NewError("insufficient free GPUs").With(kvDetails...)
	}

	// Take the permitted cards and sort their UUIDs in order of the
	// smallest number of free slots first
	sort.Slice(slotsByUUID, func(i, j int) bool {
		if slotsByUUID[i].freeSlots < slotsByUUID[j].freeSlots {
			return true
		}

		if slotsByUUID[i].freeSlots > slotsByUUID[j].freeSlots {
			return false
		}

		return slotsByUUID[i].uuid < slotsByUUID[j].uuid
	})

	kvDetails = append(kvDetails, []interface{}{"slots", slotsByUUID})

	// Because we know the preferred allocation units we can simply start with the smallest quantity
	// and slowly build up enough of the smaller items to meet the need, each such collection of
	// cards becomes one candidate combination.
	//
	type reservation struct {
		uuid  string
		slots uint
	}
	type combination struct {
		cards []reservation
		waste int
	}

	combinations := []combination{}

	// Go through building combinations that work and track the waste for each solution.
	//
	for i, uuid := range slotsByUUID {
		slotsFound := usableAllocs[uuid.uuid].FreeSlots
		cmd := combination{cards: []reservation{{uuid: uuid.uuid, slots: usableAllocs[uuid.uuid].FreeSlots}}}
		if slotsFound < maxGPU {
			for _, nextUUID := range slotsByUUID[i+1:] {
				slotsFound += usableAllocs[nextUUID.uuid].FreeSlots
				cmd.cards = append(cmd.cards, reservation{uuid: nextUUID.uuid, slots: usableAllocs[nextUUID.uuid].FreeSlots})

				// We have enough slots now, stop looking and go to the next starting point
				if slotsFound >= maxGPU {
					break
				}
			}
		}

		// We have a combination that meets or exceeds our needs
		if slotsFound >= maxGPU {
			cmd.waste = int(slotsFound - maxGPU)
			combinations = append(combinations, cmd)
		}
	}

	if len(combinations) == 0 {
		kvDetails = append(kvDetails, "stack", stack.Trace().TrimRuntime())
		return nil, kv.NewError("insufficient GPU").With(kvDetails...)
	}

	// Sort the combinations by waste, get the least waste
	//
	sort.Slice(combinations, func(i, j int) bool { return combinations[i].waste < combinations[j].waste })

	// Get all of the combinations that have the least and same waste in slots
	minWaste := combinations[0].waste
	for i, comb := range combinations {
		if minWaste != comb.waste {
			combinations = combinations[:i]
			break
		}
	}

	// Sort what is left over by the number of impacted cards
	sort.Slice(combinations, func(i, j int) bool { return len(combinations[i].cards) < len(combinations[j].cards) })
	kvDetails = append(kvDetails, []interface{}{"combinations", combinations}...)
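
	// As a hypothetical illustration of the two sorts above: with maxGPU set to 4
	// and a pool containing one 4 slot card and two 2 slot cards, both the pair of
	// 2 slot cards and the single 4 slot card produce zero waste, so the sort on
	// card count prefers the single card because it touches fewer devices.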

	// OK Now we simply take the first option if one was found
	matched := combinations[0]

	if len(matched.cards) == 0 {
		kvDetails = append(kvDetails, "stack", stack.Trace().TrimRuntime())
		return nil, kv.NewError("insufficient partitioned GPUs").With(kvDetails...)
	}

	// Got as far as knowing the allocation will work so check for the live flag
	if !live {
		return nil, nil
	}

	// Go through the chosen combination of cards and do the allocations
	//
	for _, found := range matched.cards {
		slots := maxGPU
		if slots > allocator.Allocs[found.uuid].FreeSlots {
			slots = allocator.Allocs[found.uuid].FreeSlots
		}

		if maxGPUMem == 0 {
			// If the user does not know take it all, burn it to the ground
			slots = allocator.Allocs[found.uuid].FreeSlots
			maxGPUMem = allocator.Allocs[found.uuid].FreeMem
		}
		allocator.Allocs[found.uuid].FreeSlots -= slots
		allocator.Allocs[found.uuid].FreeMem -= maxGPUMem

		tracking := xid.New().String()
		alloc = append(alloc, &GPUAllocated{
			tracking: tracking,
			uuid:     found.uuid,
			slots:    slots,
			mem:      maxGPUMem,
			Env: map[string]string{
				"NVIDIA_VISIBLE_DEVICES": found.uuid,
				"CUDA_VISIBLE_DEVICES":   found.uuid,
			},
		})

		allocator.Allocs[found.uuid].Tracking[tracking] = struct{}{}
	}

	return alloc, nil
}

func (allocator *gpuTracker) ReturnGPU(alloc *GPUAllocated) (err kv.Error) {

	if alloc.slots == 0 {
		return nil
	}

	allocator.Lock()
	defer allocator.Unlock()

	// Make sure that the allocation is still valid
	if _, isPresent := allocator.Allocs[alloc.uuid]; !isPresent {
		return kv.NewError("cuda device no longer in service").With("device", alloc.uuid).With("stack", stack.Trace().TrimRuntime())
	}

	if _, isPresent := allocator.Allocs[alloc.uuid].Tracking[alloc.tracking]; !isPresent {
		return kv.NewError("invalid allocation").With("alloc_id", alloc.tracking).With("stack", stack.Trace().TrimRuntime())
	}

	delete(allocator.Allocs[alloc.uuid].Tracking, alloc.tracking)

	// If valid pass back the resources that were consumed
	allocator.Allocs[alloc.uuid].FreeSlots += alloc.slots
	allocator.Allocs[alloc.uuid].FreeMem += alloc.mem

	return nil
}

// ReturnGPU releases the GPU allocation passed in. It will validate some of the allocation
// details but is otherwise an honor system.
//
func ReturnGPU(alloc *GPUAllocated) (err kv.Error) {
	return gpuAllocs.ReturnGPU(alloc)
}
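
// exampleGPUUsage is an illustrative sketch only and is not called by the runner.
// It assumes a hypothetical experiment needing 2 slots and 4 GiB of GPU memory and
// shows how a caller might combine MonitorGPUs, AllocGPU and ReturnGPU.  The
// channel sizes and the error handling are placeholders chosen for brevity.
func exampleGPUUsage(ctx context.Context) (err kv.Error) {
	statusC := make(chan []string, 16)
	errC := make(chan kv.Error, 16)

	// Watch the hardware for ECC failures and other events in the background
	go MonitorGPUs(ctx, statusC, errC)

	// Reserve 2 slots and 4 GiB of memory, accepting cards partitioned into 2 or 4 slot units
	allocs, err := AllocGPU(2, 4*1024*1024*1024, []uint{2, 4}, true)
	if err != nil {
		return err
	}

	// Give the reserved resources back once the work is done
	defer func() {
		for _, alloc := range allocs {
			if errGo := ReturnGPU(alloc); errGo != nil && err == nil {
				err = errGo
			}
		}
	}()

	// The Env block of each allocation would be handed to the experiment process
	for _, alloc := range allocs {
		fmt.Println("allocated", alloc.Env["CUDA_VISIBLE_DEVICES"])
	}
	return nil
}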