gitlab.com/SiaPrime/SiaPrime@v1.4.1/modules/renter/uploadchunk.go

package renter

import (
	"fmt"
	"io"
	"os"
	"sync"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/NebulousLabs/fastrand"

	"gitlab.com/SiaPrime/SiaPrime/modules"
	"gitlab.com/SiaPrime/SiaPrime/modules/renter/siafile"
)

// uploadChunkID is a unique identifier for each chunk in the renter.
type uploadChunkID struct {
	fileUID siafile.SiafileUID // Unique to each file.
	index   uint64             // Unique to each chunk within a file.
}

// unfinishedUploadChunk contains a chunk from the filesystem that has not
// finished uploading, including knowledge of the progress.
type unfinishedUploadChunk struct {
	// Information about the file. localPath may be the empty string if the
	// file is known not to exist locally.
	id        uploadChunkID
	fileEntry *siafile.SiaFileSetEntry
	threadUID int

	// Information about the chunk, namely where it exists within the file.
	//
	// TODO / NOTE: As we change the file mapper, we're probably going to have
	// to update these fields. Compatibility shouldn't be an issue because
	// this struct is not persisted anywhere, it's always built from other
	// structures.
	health         float64
	index          uint64
	length         uint64
	memoryNeeded   uint64 // memory needed in bytes
	memoryReleased uint64 // memory that has been returned of memoryNeeded
	minimumPieces  int    // number of pieces required to recover the file.
	offset         int64  // Offset of the chunk within the file.
	piecesNeeded   int    // number of pieces to achieve a 100% complete upload
	stuck          bool   // indicates if the chunk was marked as stuck during the last repair
	stuckRepair    bool   // indicates if the chunk was identified for repair by the stuck loop
	priority       bool   // indicates if the chunk is supposed to be repaired asap

	// The logical data is the data that is presented to the user when the
	// user requests the chunk. The physical data is all of the pieces that
	// get stored across the network.
	logicalChunkData  [][]byte
	physicalChunkData [][]byte

	// sourceReader is an optional source for the logical chunk data. If
	// available it will be tried before the repair path or remote repair.
	sourceReader io.ReadCloser

	// Worker synchronization fields. The mutex only protects these fields.
	//
	// When a worker passes over a piece for upload to go on standby:
	//   + the worker should add itself to the list of standby chunks
	//   + the worker should call for memory to be released
	//
	// When a worker passes over a piece because it's not useful:
	//   + the worker should decrement the number of workers remaining
	//   + the worker should call for memory to be released
	//
	// When a worker accepts a piece for upload:
	//   + the worker should increment the number of pieces registered
	//   + the worker should mark the piece usage for the piece it is uploading
	//   + the worker should decrement the number of workers remaining
	//
	// When a worker completes an upload (success or failure):
	//   + the worker should decrement the number of pieces registered
	//   + the worker should call for memory to be released
	//
	// When a worker completes an upload (failure):
	//   + the worker should unmark the piece usage for the piece it registered
	//   + the worker should notify the standby workers of a new available piece
	//
	// When a worker completes an upload successfully:
	//   + the worker should increment the number of pieces completed
	//   + the worker should decrement the number of pieces registered
	//   + the worker should release the memory for the completed piece
	mu               sync.Mutex
	pieceUsage       []bool              // 'true' if a piece is either uploaded, or a worker is attempting to upload that piece.
	piecesCompleted  int                 // number of pieces that have been fully uploaded.
	piecesRegistered int                 // number of pieces that are being uploaded, but aren't finished yet (may fail).
	released         bool                // whether this chunk has been released from the active chunks set.
	unusedHosts      map[string]struct{} // hosts that aren't yet storing any pieces or performing any work.
	workersRemaining int                 // number of inactive workers still able to upload a piece.
	workersStandby   []*worker           // workers that can be used if other workers fail.

	cancelMU sync.Mutex     // cancelMU needs to be held when adding to cancelWG and reading/writing canceled.
	canceled bool           // cancel the work on this chunk.
	cancelWG sync.WaitGroup // WaitGroup to wait on after canceling the uploadchunk.
}

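// acceptPieceSketch is an illustrative sketch of the accounting rules
// documented above; it is not called by the repair pipeline and exists only
// to make the transitions concrete. The hostKey parameter and the decision
// shape are hypothetical - the real transitions live in the worker code.
func acceptPieceSketch(uc *unfinishedUploadChunk, hostKey string, i int) (accepted bool) {
	uc.mu.Lock()
	defer uc.mu.Unlock()
	// Every worker that reaches a decision stops being 'remaining'.
	uc.workersRemaining--
	_, unused := uc.unusedHosts[hostKey]
	if !unused || uc.pieceUsage[i] {
		// Not useful for this chunk; the caller should release memory.
		return false
	}
	// Accept the piece: claim the host, mark the piece in use, and register
	// the upload attempt.
	delete(uc.unusedHosts, hostKey)
	uc.pieceUsage[i] = true
	uc.piecesRegistered++
	return true
}
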
// managedNotifyStandbyWorkers is called when a worker fails to upload a
// piece, meaning that the standby workers may now be needed to help the chunk
// finish uploading.
func (uc *unfinishedUploadChunk) managedNotifyStandbyWorkers() {
	// Copy the standby workers into a new slice and reset it since we can't
	// hold the lock while calling the managed function.
	uc.mu.Lock()
	standbyWorkers := make([]*worker, len(uc.workersStandby))
	copy(standbyWorkers, uc.workersStandby)
	uc.workersStandby = uc.workersStandby[:0]
	uc.mu.Unlock()

	for i := 0; i < len(standbyWorkers); i++ {
		standbyWorkers[i].callQueueUploadChunk(uc)
	}
}

// chunkComplete checks some fields of the chunk to determine if the chunk is
// completed. This can either mean that it ran out of workers or that it was
// uploaded successfully.
func (uc *unfinishedUploadChunk) chunkComplete() bool {
	// The whole chunk was uploaded successfully.
	if uc.piecesCompleted == uc.piecesNeeded && uc.piecesRegistered == 0 {
		return true
	}
	// We are no longer doing any uploads and we don't have any workers left.
	if uc.workersRemaining == 0 && uc.piecesRegistered == 0 {
		return true
	}
	return false
}

// readLogicalData initializes the chunk's logicalChunkData using data read
// from r, returning the number of bytes read. A short read is not an error;
// any data pieces that could not be filled stay zero-padded before encoding.
func (uc *unfinishedUploadChunk) readLogicalData(r io.Reader) (uint64, error) {
	// Allocate data pieces and fill them with data from r.
	ec := uc.fileEntry.ErasureCode()
	dataPieces := make([][]byte, ec.MinPieces())
	var total uint64
	for i := range dataPieces {
		dataPieces[i] = make([]byte, uc.fileEntry.PieceSize())
		n, err := io.ReadFull(r, dataPieces[i])
		total += uint64(n)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return total, errors.AddContext(err, "failed to read chunk from source reader")
		}
	}
	// Encode the data pieces, forming the chunk's logical data.
	var err error
	uc.logicalChunkData, err = ec.EncodeShards(dataPieces)
	if err != nil {
		return total, errors.AddContext(err, "failed to encode data pieces")
	}
	return total, nil
}

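// logicalChunkSizeSketch is an illustrative helper, not used by the pipeline:
// it shows how the erasure-code parameters determine the amount of file data
// readLogicalData attempts to read. For a hypothetical 10-of-30 code with
// 4 MiB pieces this is 10 * 4 MiB = 40 MiB; the same quantity is requested as
// erasureCodingMemory in threadedFetchAndRepairChunk below.
func logicalChunkSizeSketch(uc *unfinishedUploadChunk) uint64 {
	ec := uc.fileEntry.ErasureCode()
	return uint64(ec.MinPieces()) * uc.fileEntry.PieceSize()
}
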
// managedDistributeChunkToWorkers will take a chunk with fully prepared
// physical data and distribute it to the worker pool.
func (r *Renter) managedDistributeChunkToWorkers(uc *unfinishedUploadChunk) {
	// Give the chunk to each worker, marking the number of workers that have
	// received the chunk. The workers cannot be interacted with while the
	// renter is holding a lock, so we need to build a list of workers while
	// under lock and then launch work jobs after that.
	r.staticWorkerPool.mu.RLock()
	uc.workersRemaining += len(r.staticWorkerPool.workers)
	workers := make([]*worker, 0, len(r.staticWorkerPool.workers))
	for _, worker := range r.staticWorkerPool.workers {
		workers = append(workers, worker)
	}
	r.staticWorkerPool.mu.RUnlock()
	for _, worker := range workers {
		worker.callQueueUploadChunk(uc)
	}
}

// managedDownloadLogicalChunkData will fetch the logical chunk data by
// sending a download to the renter's downloader, and then using the data that
// gets returned.
func (r *Renter) managedDownloadLogicalChunkData(chunk *unfinishedUploadChunk) error {
	// Determine what the download length should be. Normally it is just the
	// chunk size, but if this is the last chunk we need to download less
	// because the file is not that large.
	//
	// TODO: There is a disparity in the way that the upload and download code
	// handle the last chunk, which may not be full sized.
	downloadLength := chunk.length
	if chunk.index == chunk.fileEntry.NumChunks()-1 && chunk.fileEntry.Size()%chunk.length != 0 {
		downloadLength = chunk.fileEntry.Size() % chunk.length
	}

	// Prepare snapshot.
	snap, err := chunk.fileEntry.Snapshot()
	if err != nil {
		return err
	}
	// Create the download.
	buf := NewDownloadDestinationBuffer()
	d, err := r.managedNewDownload(downloadParams{
		destination:     buf,
		destinationType: "buffer",
		file:            snap,

		latencyTarget: 200e3, // No need to rush latency on repair downloads.
		length:        downloadLength,
		needsMemory:   false, // We already requested memory, the download memory fits inside of that.
		offset:        uint64(chunk.offset),
		overdrive:     0, // No need to rush the latency on repair downloads.
		priority:      0, // Repair downloads are completely de-prioritized.
	})
	if err != nil {
		return err
	}

	// Register some cleanup for when the download is done.
	d.OnComplete(func(_ error) error {
		// Update the access time when the download is done.
		return chunk.fileEntry.SiaFile.UpdateAccessTime()
	})

	// Set the in-memory buffer to nil just to be safe in case of a memory
	// leak.
	defer func() {
		d.destination = nil
	}()

	// Wait for the download to complete.
	select {
	case <-d.completeChan:
	case <-r.tg.StopChan():
		return errors.New("repair download interrupted by stop call")
	}
	if d.Err() != nil {
		buf.pieces = nil
		return d.Err()
	}
	chunk.logicalChunkData = buf.pieces
	return nil
}

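// lastChunkLengthSketch is an illustrative restatement of the downloadLength
// calculation above with concrete (hypothetical) numbers; it is not used by
// the pipeline. For a 100 MiB file and a 40 MiB chunk size, chunks 0 and 1
// are full sized, while the final chunk (index 2) only covers
// 100 % 40 = 20 MiB, so only 20 MiB should be downloaded for its repair.
func lastChunkLengthSketch(fileSize, chunkSize, chunkIndex, numChunks uint64) uint64 {
	if chunkIndex == numChunks-1 && fileSize%chunkSize != 0 {
		return fileSize % chunkSize
	}
	return chunkSize
}
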
// threadedFetchAndRepairChunk will fetch the logical data for a chunk, create
// the physical pieces for the chunk, and then distribute them.
func (r *Renter) threadedFetchAndRepairChunk(chunk *unfinishedUploadChunk) {
	err := r.tg.Add()
	if err != nil {
		return
	}
	defer r.tg.Done()

	// Calculate the amount of memory needed for erasure coding. This will
	// need to be released if there's an error before erasure coding is
	// complete.
	erasureCodingMemory := chunk.fileEntry.PieceSize() * uint64(chunk.fileEntry.ErasureCode().MinPieces())

	// Calculate the amount of memory to release due to already completed
	// pieces. This memory gets released during encryption, but needs to be
	// released if there's a failure before encryption happens.
	var pieceCompletedMemory uint64
	for i := 0; i < len(chunk.pieceUsage); i++ {
		if chunk.pieceUsage[i] {
			pieceCompletedMemory += modules.SectorSize
		}
	}

	// Ensure that memory is released and that the chunk is cleaned up
	// properly after the chunk is distributed.
	//
	// Need to ensure the erasure coding memory is released as well as the
	// physical chunk memory. Physical chunk memory is released by setting
	// 'workersRemaining' to zero if the repair fails before being distributed
	// to workers. Erasure coding memory is released manually if the repair
	// fails before the erasure coding occurs.
	defer r.managedCleanUpUploadChunk(chunk)

	// Fetch the logical data for the chunk.
	err = r.managedFetchLogicalChunkData(chunk)
	if err != nil {
		// Logical data is not available, cannot upload. Chunk will not be
		// distributed to workers, therefore set workersRemaining equal to
		// zero. The erasure coding memory has not been released yet, be sure
		// to release that as well.
		chunk.logicalChunkData = nil
		chunk.workersRemaining = 0
		r.memoryManager.Return(erasureCodingMemory + pieceCompletedMemory)
		chunk.memoryReleased += erasureCodingMemory + pieceCompletedMemory
		r.log.Debugln("Fetching logical data of a chunk failed:", err)

		// Mark chunk as stuck.
		r.log.Debugln("Marking chunk", chunk.id, "as stuck due to error fetching logical chunk data")
		err = chunk.fileEntry.SetStuck(chunk.index, true)
		if err != nil {
			r.log.Debugln("Error marking chunk", chunk.id, "as stuck:", err)
		}
		return
	}

	// Create the physical pieces for the data. Immediately release the
	// logical data.
	//
	// TODO: The logical data is the first few pieces of the physical data.
	// If the memory is not being handled cleanly here, we should leverage
	// that fact to reduce the total memory required to create the physical
	// data. That will also change the amount of memory we need to allocate,
	// and the number of times we need to return memory.
	err = chunk.fileEntry.ErasureCode().Reconstruct(chunk.logicalChunkData)
	chunk.physicalChunkData = chunk.logicalChunkData
	chunk.logicalChunkData = nil
	r.memoryManager.Return(erasureCodingMemory)
	chunk.memoryReleased += erasureCodingMemory
	if err != nil {
		// Physical data is not available, cannot upload. Chunk will not be
		// distributed to workers, therefore set workersRemaining equal to
		// zero.
		chunk.workersRemaining = 0
		r.memoryManager.Return(pieceCompletedMemory)
		chunk.memoryReleased += pieceCompletedMemory
		for i := 0; i < len(chunk.physicalChunkData); i++ {
			chunk.physicalChunkData[i] = nil
		}
		r.log.Debugln("Fetching physical data of a chunk failed:", err)

		// Mark chunk as stuck.
		r.log.Debugln("Marking chunk", chunk.id, "as stuck due to an error with the physical data")
		err = chunk.fileEntry.SetStuck(chunk.index, true)
		if err != nil {
			r.log.Debugln("Error marking chunk", chunk.id, "as stuck:", err)
		}
		return
	}

	// Sanity check - we should have at least as many physical data pieces as
	// we do elements in our piece usage.
	if len(chunk.physicalChunkData) < len(chunk.pieceUsage) {
		r.log.Critical("not enough physical pieces to match the upload settings of the file")
		// Mark chunk as stuck.
		r.log.Debugln("Marking chunk", chunk.id, "as stuck due to insufficient physical pieces")
		err = chunk.fileEntry.SetStuck(chunk.index, true)
		if err != nil {
			r.log.Debugln("Error marking chunk", chunk.id, "as stuck:", err)
		}
		return
	}
	// Loop through the pieces and encrypt any that are needed, while dropping
	// any pieces that are not needed.
	for i := 0; i < len(chunk.pieceUsage); i++ {
		if chunk.pieceUsage[i] {
			chunk.physicalChunkData[i] = nil
		} else {
			// Encrypt the piece.
			key := chunk.fileEntry.MasterKey().Derive(chunk.index, uint64(i))
			chunk.physicalChunkData[i] = key.EncryptBytes(chunk.physicalChunkData[i])
			// If the piece was not a full sector, pad it accordingly with
			// random bytes.
			if short := int(modules.SectorSize) - len(chunk.physicalChunkData[i]); short > 0 {
				// The form `append(obj, make([]T, n)...)` is optimized by the
				// compiler to eliminate unneeded allocations starting go 1.11.
				chunk.physicalChunkData[i] = append(chunk.physicalChunkData[i], make([]byte, short)...)
				fastrand.Read(chunk.physicalChunkData[i][len(chunk.physicalChunkData[i])-short:])
			}
		}
	}
	// Return the released memory.
	if pieceCompletedMemory > 0 {
		r.memoryManager.Return(pieceCompletedMemory)
		chunk.memoryReleased += pieceCompletedMemory
	}

	// Distribute the chunk to the workers.
	r.managedDistributeChunkToWorkers(chunk)
}

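// padPieceSketch is an illustrative sketch that isolates the padding step
// above; it is not used by the pipeline. A piece shorter than a sector is
// grown to modules.SectorSize and the tail is filled with random bytes, so
// hosts always receive full sectors and the padding is indistinguishable from
// the surrounding ciphertext.
func padPieceSketch(piece []byte) []byte {
	if short := int(modules.SectorSize) - len(piece); short > 0 {
		piece = append(piece, make([]byte, short)...)
		fastrand.Read(piece[len(piece)-short:])
	}
	return piece
}
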
// managedFetchLogicalChunkData will get the raw data for a chunk, pulling it
// from disk if possible but otherwise queueing a download.
//
// chunk.data should be passed as 'nil' to the download, to keep memory usage
// as light as possible.
func (r *Renter) managedFetchLogicalChunkData(chunk *unfinishedUploadChunk) error {
	// If a sourceReader is available, use it.
	if chunk.sourceReader != nil {
		defer chunk.sourceReader.Close()
		n, err := chunk.readLogicalData(chunk.sourceReader)
		if err != nil {
			return err
		}
		// Adjust the file size. Since we don't know the length of the stream
		// beforehand, we simply assume that a whole chunk will be added to
		// the file. That's why we subtract the difference between the size of
		// a chunk and n here.
		adjustedSize := chunk.fileEntry.Size() - chunk.length + n
		if errSize := chunk.fileEntry.SetFileSize(adjustedSize); errSize != nil {
			return errors.AddContext(errSize, "failed to adjust FileSize")
		}
		return nil
	}

	// Download the chunk if it's not on disk.
	if chunk.fileEntry.LocalPath() == "" {
		return r.managedDownloadLogicalChunkData(chunk)
	}

	// Try to read the data from disk. If that fails, fall back to
	// downloading.
	err := func() error {
		osFile, err := os.Open(chunk.fileEntry.LocalPath())
		if err != nil {
			return err
		}
		defer osFile.Close()
		sr := io.NewSectionReader(osFile, chunk.offset, int64(chunk.length))
		_, err = chunk.readLogicalData(sr)
		return err
	}()
	if err != nil {
		r.log.Debugln("failed to read file, downloading instead:", err)
		return r.managedDownloadLogicalChunkData(chunk)
	}
	return nil
}

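// streamSizeSketch is an illustrative restatement of the file-size adjustment
// above with concrete (hypothetical) numbers; it is not used by the pipeline.
// Suppose chunks are 40 MiB and the file size was provisionally grown by a
// full chunk to 120 MiB, but the stream only delivered n = 25 MiB for this
// chunk. The corrected size is 120 - 40 + 25 = 105 MiB.
func streamSizeSketch(provisionalSize, chunkSize, n uint64) uint64 {
	return provisionalSize - chunkSize + n
}
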
// managedCleanUpUploadChunk will check the state of the chunk and perform any
// cleanup required. This can include returning memory and releasing the chunk
// from the map of active chunks in the chunk heap.
func (r *Renter) managedCleanUpUploadChunk(uc *unfinishedUploadChunk) {
	uc.mu.Lock()
	piecesAvailable := 0
	var memoryReleased uint64
	// Release any unnecessary pieces, counting any pieces that are currently
	// available.
	for i := 0; i < len(uc.pieceUsage); i++ {
		// Skip the piece if it's not available.
		if uc.pieceUsage[i] {
			continue
		}

		// If we have all the available pieces we need, release this piece.
		// Otherwise, mark that there's another piece available. This
		// algorithm will prefer releasing later pieces, which improves
		// computational complexity for erasure coding.
		if piecesAvailable >= uc.workersRemaining {
			memoryReleased += modules.SectorSize
			if len(uc.physicalChunkData) < len(uc.pieceUsage) {
				// TODO handle this. Might happen if erasure coding the chunk
				// failed.
			}
			uc.physicalChunkData[i] = nil
			// Mark this piece as taken so that we don't double release
			// memory.
			uc.pieceUsage[i] = true
		} else {
			piecesAvailable++
		}
	}

	// Check if the chunk needs to be removed from the list of active chunks.
	// It needs to be removed if the chunk is complete, but hasn't yet been
	// released.
	chunkComplete := uc.chunkComplete()
	released := uc.released
	if chunkComplete && !released {
		uc.released = true
	}
	uc.memoryReleased += memoryReleased
	totalMemoryReleased := uc.memoryReleased
	uc.mu.Unlock()

	// If there are pieces available, add the standby workers to collect them.
	// Standby workers are only added to the chunk when piecesAvailable is
	// equal to zero, meaning this code will only trigger if the number of
	// pieces available increases from zero. That can only happen if a worker
	// experiences an error during upload.
	if piecesAvailable > 0 {
		uc.managedNotifyStandbyWorkers()
	}
	// If required, remove the chunk from the set of repairing chunks.
	if chunkComplete && !released {
		r.managedUpdateUploadChunkStuckStatus(uc)
		// Close the file entry unless disrupted.
		if !r.deps.Disrupt("disableCloseUploadEntry") {
			err := uc.fileEntry.Close()
			if err != nil {
				r.log.Debugf("WARN: file not closed after chunk upload complete: %v %v", r.staticFileSet.SiaPath(uc.fileEntry), err)
			}
		}
		// Remove the chunk from the repairingChunks map.
		r.uploadHeap.managedMarkRepairDone(uc.id)
		// Signal the garbage collector to free memory before returning it to
		// the manager.
		uc.logicalChunkData = nil
		uc.physicalChunkData = nil
	}
	// If required, return the memory to the renter.
	if memoryReleased > 0 {
		r.memoryManager.Return(memoryReleased)
	}
	// Sanity check - all memory should be released if the chunk is complete.
	if chunkComplete && totalMemoryReleased != uc.memoryNeeded {
		r.log.Critical("No workers remaining, but not all memory released:", uc.workersRemaining, uc.piecesRegistered, uc.memoryReleased, uc.memoryNeeded)
	}
}

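// releasePlanSketch is an illustrative sketch of the release loop above; it
// is not used by the pipeline. With pieceUsage = [true, false, false, false]
// and workersRemaining = 2, the two earliest unused pieces (indices 1 and 2)
// are kept for the remaining workers and the later piece (index 3) is
// released, matching the stated preference for releasing later pieces.
func releasePlanSketch(pieceUsage []bool, workersRemaining int) (keep, release []int) {
	available := 0
	for i, used := range pieceUsage {
		if used {
			continue
		}
		if available >= workersRemaining {
			release = append(release, i)
		} else {
			keep = append(keep, i)
			available++
		}
	}
	return keep, release
}
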
// managedSetStuckAndClose sets the unfinishedUploadChunk's stuck status,
// triggers threadedBubble to update the directory, and then closes the
// fileEntry.
func (r *Renter) managedSetStuckAndClose(uc *unfinishedUploadChunk, stuck bool) error {
	// Update the chunk's stuck status.
	err := uc.fileEntry.SetStuck(uc.index, stuck)
	if err != nil {
		return fmt.Errorf("WARN: unable to update chunk stuck status for file %v: %v", r.staticFileSet.SiaPath(uc.fileEntry), err)
	}
	// Close the SiaFile.
	err = uc.fileEntry.Close()
	if err != nil {
		return fmt.Errorf("WARN: unable to close siafile %v: %v", r.staticFileSet.SiaPath(uc.fileEntry), err)
	}
	// Signal the garbage collector to free memory.
	uc.physicalChunkData = nil
	uc.logicalChunkData = nil
	return nil
}

// managedUpdateUploadChunkStuckStatus checks to see if the repair was
// successful and then updates the chunk's stuck status.
func (r *Renter) managedUpdateUploadChunkStuckStatus(uc *unfinishedUploadChunk) {
	// Grab the necessary information from the upload chunk under lock.
	uc.mu.Lock()
	index := uc.id.index
	stuck := uc.stuck
	minimumPieces := uc.minimumPieces
	piecesCompleted := uc.piecesCompleted
	piecesNeeded := uc.piecesNeeded
	stuckRepair := uc.stuckRepair
	uc.mu.Unlock()

	// Determine if the repair was successful: the fraction of repairable
	// redundancy that is still missing must be below RepairThreshold.
	successfulRepair := float64(piecesNeeded-piecesCompleted)/float64(piecesNeeded-minimumPieces) < RepairThreshold

	// Check if the renter is shutting down.
	var renterError bool
	select {
	case <-r.tg.StopChan():
		renterError = true
	default:
		// Check that the renter is still online.
		if !r.g.Online() {
			renterError = true
		}
	}

	// If the repair was unsuccessful and there was a renter error then
	// return.
	if !successfulRepair && renterError {
		r.log.Debugln("WARN: repair unsuccessful for chunk", uc.id, "due to an error with the renter")
		return
	}
	// Log if the repair was unsuccessful.
	if !successfulRepair {
		r.log.Debugln("WARN: repair unsuccessful, marking chunk", uc.id, "as stuck", float64(piecesCompleted)/float64(piecesNeeded))
	} else {
		r.log.Debugln("SUCCESS: repair successful, marking chunk as non-stuck:", uc.id)
	}
	// Update the chunk's stuck status.
	if err := uc.fileEntry.SetStuck(index, !successfulRepair); err != nil {
		r.log.Printf("WARN: could not set chunk %v stuck status for file %v: %v", uc.id, uc.fileEntry.SiaFilePath(), err)
	}

	// Check to see if the chunk was stuck and now is successfully repaired by
	// the stuck loop.
	if stuck && successfulRepair && stuckRepair {
		r.log.Debugln("Stuck chunk", uc.id, "successfully repaired")
		// Add the file to the successful stuck repair stack if there are
		// still stuck chunks to repair.
		if uc.fileEntry.NumStuckChunks() > 0 {
			r.stuckStack.managedPush(r.staticFileSet.SiaPath(uc.fileEntry))
		}
		// Signal the stuck loop that the chunk was successfully repaired.
		select {
		case <-r.tg.StopChan():
			r.log.Debugln("WARN: renter shut down before the stuck loop was signalled that the stuck repair was successful")
			return
		case r.uploadHeap.stuckChunkSuccess <- struct{}{}:
		default:
		}
	}
}

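// repairSuccessSketch is an illustrative restatement of the success criterion
// above with concrete numbers; it is not used by the pipeline, and the 0.25
// threshold in the example is hypothetical. For a 10-of-30 erasure code
// (minimumPieces = 10, piecesNeeded = 30), a repair that completed 26 pieces
// leaves (30-26)/(30-10) = 0.2 of the repairable redundancy missing, which is
// below 0.25, so the chunk is considered successfully repaired and is marked
// non-stuck.
func repairSuccessSketch(piecesNeeded, piecesCompleted, minimumPieces int, repairThreshold float64) bool {
	return float64(piecesNeeded-piecesCompleted)/float64(piecesNeeded-minimumPieces) < repairThreshold
}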