storj.io/uplink@v1.13.0/private/eestream/stripe.go

// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.

package eestream

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/spacemonkeygo/monkit/v3"
	"golang.org/x/exp/slices"

	"storj.io/common/rpc/rpctracing"
	"storj.io/common/sync2"
	"storj.io/infectious"
)

const (
	debugEnabled             = false
	maxStripesAhead          = 256 // might be interesting to test different values later
	quiescentCheckInterval   = time.Second
	quiescentIntervalTrigger = 5 // number of quiescent check intervals before triggering
)

// pieceReader represents the stream of shares within one piece.
type pieceReader struct {
	shareNum     int
	source       io.Reader
	sourceCloser io.Closer
	buffer       *StreamingPiece

	backpressureMu  sync.Mutex
	backpressure    sync.Cond
	completedShares int
}

// StripeReader reads from a collection of piece io.ReadClosers in parallel,
// recombining them into a single stream using an ErasureScheme.
type StripeReader struct {
	bundy           *PiecesProgress
	pieces          []pieceReader
	scheme          ErasureScheme
	wg              sync.WaitGroup
	stripeReady     sync2.Event
	returnedStripes int32
	totalStripes    int32
	errorDetection  bool
	runningPieces   atomic.Int32
	quiescent       atomic.Bool
}

// NewStripeReader makes a new StripeReader using the provided map of share
// number to io.ReadClosers, an ErasureScheme, the total number of stripes in
// the stream, and whether or not to use the Erasure Scheme's error detection.
func NewStripeReader(readers map[int]io.ReadCloser, scheme ErasureScheme, totalStripes int,
	errorDetection bool) *StripeReader {

	pool := NewBatchPool(scheme.ErasureShareSize())

	totalPieceSize := int64(totalStripes) * int64(scheme.ErasureShareSize())

	pieces := make([]pieceReader, 0, len(readers))
	for shareNum, source := range readers {
		pieces = append(pieces, pieceReader{
			shareNum:     shareNum,
			source:       io.LimitReader(source, totalPieceSize),
			sourceCloser: source,
			buffer:       NewStreamingPiece(scheme.ErasureShareSize(), totalPieceSize, pool),
		})
		piece := &pieces[len(pieces)-1]
		piece.backpressure.L = &piece.backpressureMu
	}

	minimum := int32(scheme.RequiredCount())
	if errorDetection && minimum < int32(len(pieces)) {
		minimum++
	}

	s := &StripeReader{
		bundy:          NewPiecesProgress(minimum, int32(len(pieces))),
		pieces:         pieces,
		scheme:         scheme,
		totalStripes:   int32(totalStripes),
		errorDetection: errorDetection,
	}
	s.start()
	return s
}
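
// A worked example of the minimum above (hypothetical numbers, not from the
// code): with a 29-of-80 scheme and 39 connected pieces, RequiredCount() is
// 29, so with errorDetection enabled the bundy clock is told to wait for 30
// shares per stripe, presumably so Decode has one share beyond the bare
// minimum with which to notice a corrupted reconstruction. If exactly 29
// pieces are connected, the minimum stays at 29 and the extra verification
// share is unavailable.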

// start creates the goroutines to start reading each of the share streams.
func (s *StripeReader) start() {
	if debugEnabled {
		fmt.Println("starting", len(s.pieces), "readers")
	}

	var pwg sync.WaitGroup
	s.runningPieces.Store(int32(len(s.pieces)))

	for idx := range s.pieces {
		s.wg.Add(1)
		pwg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			defer pwg.Done()

			// whenever a share reader is done, we should wake up the core in case
			// this share reader just exited unsuccessfully and this represents a
			// failure to get enough pieces.
			defer s.stripeReady.Signal()

			// we should mark that there is one less running share reader.
			defer s.runningPieces.Add(-1)

			// do the work.
			s.readShares(idx)
		}(idx)
	}

	done := make(chan struct{})
	go func() {
		pwg.Wait()
		close(done)
	}()

	s.wg.Add(1)
	go func() {
		defer s.wg.Done()

		s1 := s.bundy.ProgressSnapshot(nil)
		var s2 []int32

		t := time.NewTicker(quiescentCheckInterval)
		defer t.Stop()

		match := 0
		for {
			select {
			case <-t.C:
				s2 = s.bundy.ProgressSnapshot(s2[:0])

				if !slices.Equal(s1, s2) {
					match = 0
					s2, s1 = s1, s2
					continue
				}

				match++
				if match == quiescentIntervalTrigger {
					s.quiescent.Store(true)
					s.stripeReady.Signal()
					return
				}

			case <-done:
				return
			}
		}
	}()
}

// readShares is the method that does the actual work of reading an individual
// share stream.
func (s *StripeReader) readShares(idx int) {
	r := &s.pieces[idx]
	stripesSoFar := 0
	for {
		// see if we can fill this index's buffer with data from r.source.
		shares, done := r.buffer.ReadSharesFrom(r.source)

		// did we get any shares?
		if shares > 0 {
			// yay!
			stripesSoFar += shares
			if debugEnabled {
				fmt.Println(idx, "read", shares, "shares")
			}
			// tell the bundy clock
			if s.bundy.SharesCompleted(idx, int32(shares)) {
				// oh hey, bundy says we just changed the situation and we should wake
				// up the core.
				if debugEnabled {
					fmt.Println(idx, "bundy counter says", shares, "is ready")
				}
				s.stripeReady.Signal()
			}
		} else if debugEnabled {
			fmt.Println(idx, "read 0 shares?")
		}

		// will we get any more shares?
		if done {
			if debugEnabled {
				fmt.Println(idx, "done")
			}
			break
		}

		r.backpressure.L.Lock()
		// how far ahead are we? are we too far ahead of the core? if so, let's
		// wait. the core will mark us completed if things are closing.
		for stripesSoFar > r.completedShares+maxStripesAhead &&
			r.completedShares < int(s.totalStripes) {
			r.backpressure.Wait()
		}
		r.backpressure.L.Unlock()
	}
}

// markCompleted updates the pieceReader's accounting of how far ahead it is
// from the core, and also tells the *StreamingPiece whether it can free up
// some internal buffers.
func (r *pieceReader) markCompleted(stripes int) {
	r.backpressure.L.Lock()
	defer r.backpressure.L.Unlock()
	r.buffer.MarkCompleted(stripes)
	if stripes > r.completedShares {
		r.completedShares = stripes
	}
	// the pieceReader might be asleep. let's wake it up.
	r.backpressure.Signal()
}
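
// A worked example of the backpressure window above (illustrative numbers):
// with maxStripesAhead = 256, a share reader that has buffered
// stripesSoFar = 300 shares while the core has only consumed
// completedShares = 20 blocks in backpressure.Wait(), since 300 > 20+256.
// Once the core calls markCompleted(50), the reader wakes up and may buffer
// ahead again until it is past share 306.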

// Close does *not* close the readers it received in the constructor.
// Close does *not* wait for reader goroutines to shut down. See CloseAndWait
// if you want other behavior. Close mimics the older eestream.StripeReader
// behavior.
func (s *StripeReader) Close() error {
	for idx := range s.pieces {
		s.wg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			r := &s.pieces[idx]
			r.markCompleted(int(s.totalStripes))
		}(idx)
	}
	return nil
}

// CloseAndWait closes all readers and waits for all goroutines.
func (s *StripeReader) CloseAndWait() error {
	for idx := range s.pieces {
		s.wg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			r := &s.pieces[idx]
			_ = r.sourceCloser.Close()
			r.markCompleted(int(s.totalStripes))
		}(idx)
	}
	s.wg.Wait()
	return nil
}

func (s *StripeReader) combineErrs() error {
	var errstrings []string
	for idx := range s.pieces {
		if err := s.pieces[idx].buffer.Err(); err != nil && !errors.Is(err, io.EOF) {
			errstrings = append(errstrings, fmt.Sprintf("\nerror retrieving piece %02d: %v", s.pieces[idx].shareNum, err))
		}
	}
	if len(errstrings) > 0 {
		sort.Strings(errstrings)
		return Error.New("failed to download segment: %s", strings.Join(errstrings, ""))
	}
	return Error.New("programmer error: no errors to combine")
}

var backcompatMon = monkit.ScopeNamed("storj.io/storj/uplink/eestream")
var monReadStripeTask = mon.Task()
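
// ReadStripes below treats the bundy clock's per-piece share watermarks as a
// min-style aggregate. A sketch with hypothetical numbers, assuming
// returnedStripes = 0, NeededShares() = 3, and an output buffer with room
// for at least 4 stripes:
//
//	watermarks:        [5, 0, 7, 4]   (shares received per piece)
//	requiredWatermark: 1              (returnedStripes + 1)
//	ready:             pieces 0, 2, 3 (watermark >= 1)
//	stripesFound:      min(5, 7, 4) = 4
//
// so stripes 0 through 3 can be decoded in this pass, using shares from the
// three ready pieces.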

// ReadStripes returns 1 or more stripes. out is overwritten.
func (s *StripeReader) ReadStripes(ctx context.Context, nextStripe int64, out []byte) (_ []byte, count int, err error) {
	defer monReadStripeTask(&ctx)(&err)
	ctx = rpctracing.WithoutDistributedTracing(ctx)

	if nextStripe != int64(s.returnedStripes) {
		return nil, 0, Error.New("unexpected next stripe")
	}

	// first, some memory management. do we have a place to write the results,
	// and how many stripes can we write?
	if cap(out) <= 0 {
		out = make([]byte, 0, globalBufSize)
	}
	maxStripes := int32(cap(out) / s.scheme.StripeSize())
	if debugEnabled {
		fmt.Println("core initial stripe calc", maxStripes, s.returnedStripes, s.totalStripes)
	}
	if s.returnedStripes+maxStripes > s.totalStripes {
		maxStripes = s.totalStripes - s.returnedStripes
	}
	if maxStripes <= 0 {
		return nil, 0, io.EOF
	}

	if debugEnabled {
		fmt.Println("core downloading", maxStripes, "at stripe size", s.scheme.StripeSize(), "with cap", cap(out))
	}

	// okay, let's tell the bundy clock we just want one new stripe. hopefully
	// we get more than just 1.
	requiredWatermark := s.returnedStripes + 1
	s.bundy.SetStripesNeeded(requiredWatermark)

	// if the bundy clock wakes up, we're going to find the lowest watermark
	// with the neededShares number of shares per stripe. since we're
	// essentially doing a min operation, let's start stripesFound at the
	// highest value we want it, and we will lower it as we inspect the
	// pieceSharesReceived on the bundy clock.
	stripesFound := s.returnedStripes + maxStripes

	ready := make([]int, 0, len(s.pieces))

	for {
		// check if we were woken from quiescence. if so, error out.
		if s.quiescent.Load() {
			return nil, 0, QuiescentError.New("")
		}

		// okay, let's tell the bundy clock we're awake and it should be okay to
		// wake us up again next time we sleep.
		s.bundy.AcknowledgeNewStripes()

		// let's also load the number of running pieces first before we go
		// evaluate their work to avoid a race.
		runningPieces := s.runningPieces.Load()

		// see how many are ready
		ready = ready[:0]
		for idx := range s.pieces {
			watermark := s.bundy.PieceSharesReceived(idx)
			if watermark >= requiredWatermark {
				ready = append(ready, idx)
				if watermark < stripesFound {
					// keep stripesFound at the smallest watermark
					stripesFound = watermark
				}
			}
		}
		if debugEnabled {
			fmt.Println("core found", len(ready), "ready")
		}

		// how many were ready? if we cleared the current neededShares, we can
		// break out of our condition variable for loop.
		if int32(len(ready)) >= s.bundy.NeededShares() {
			if debugEnabled {
				fmt.Println("core bundy says that's enough. hooray")
			}
			// hooray!
			break
		}

		// not enough ready.
		// okay, were there enough running share readers at the start still so
		// that we could potentially still have enough ready in the future?
		if runningPieces+int32(len(ready)) < s.bundy.NeededShares() {
			// nope. we need to give up.
			backcompatMon.Meter("download_stripe_failed_not_enough_pieces_uplink").Mark(1) //mon:locked
			return nil, 0, s.combineErrs()
		}

		if debugEnabled {
			fmt.Println("core", len(ready), "ready not enough for", s.bundy.NeededShares(), ", sleeping")
		}

		// let's wait for the bundy clock to tell a share reader to wake us up.
		if !s.stripeReady.Wait(ctx) {
			return nil, 0, ctx.Err()
		}
	}

	// okay, we have enough share readers ready.

	// some pre-allocated working memory for erasure share calls.
	fecShares := make([]infectious.Share, 0, len(ready))

	// we're going to loop through the stripesFound - s.returnedStripes new
	// stripes we have available.
	for stripe := int(s.returnedStripes); stripe < int(stripesFound); stripe++ {
		stripeOffset := (stripe - int(s.returnedStripes)) * s.scheme.StripeSize()
		if debugEnabled {
			fmt.Println("core piecing together stripe", stripe, "and writing at offset", stripeOffset)
		}

		outslice := out[stripeOffset : stripeOffset+s.scheme.StripeSize()]

		fecShares = fecShares[:0]
		var releases []func()

		for _, idx := range ready {
			data, release, err := s.pieces[idx].buffer.ReadShare(stripe)
			if err != nil {
				return nil, 0, Error.New("unexpected error: %w", err)
			}
			releases = append(releases, release)
			fecShares = append(fecShares, infectious.Share{
				Number: s.pieces[idx].shareNum,
				Data:   data})
		}

		if s.errorDetection {
			_, err = s.scheme.Decode(outslice, fecShares)
		} else {
			err = s.scheme.Rebuild(fecShares, func(r infectious.Share) {
				copy(outslice[r.Number*len(r.Data):(r.Number+1)*len(r.Data)], r.Data)
			})
		}

		for _, release := range releases {
			release()
		}

		if err != nil {
			if needsMoreShares(err) {
				if s.bundy.IncreaseNeededShares() {
					// just start over now
					return s.ReadStripes(ctx, nextStripe, out)
				}
			}
			return nil, 0, Error.New("error decoding data: %w", err)
		}
	}

	// okay, we're about to say we got a bunch of shares, so let's tell all the
	// share readers to raise their watermark of what's done.
	for idx := range s.pieces {
		s.pieces[idx].markCompleted(int(stripesFound))
	}

	stripes := stripesFound - s.returnedStripes
	s.returnedStripes = stripesFound

	if debugEnabled {
		fmt.Println("core returned", int(stripes)*s.scheme.StripeSize(), "bytes and", stripes, "stripes")
	}

	return out[:int(stripes)*s.scheme.StripeSize()], int(stripes), nil
}
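
// A sketch of the decode-failure escalation above, assuming the same
// hypothetical 29-of-80 scheme with error detection: the bundy clock starts
// out requiring 30 shares per stripe; if Decode nevertheless reports
// TooManyErrors or NotEnoughShares, IncreaseNeededShares raises the
// requirement (31, 32, ...) and ReadStripes restarts itself with the same
// nextStripe so the wait loop collects the larger share set. When the
// requirement can no longer be raised, IncreaseNeededShares returns false
// and the decode error is returned to the caller.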

// needsMoreShares reports whether the decode error indicates that retrying
// with additional shares could succeed.
func needsMoreShares(err error) bool {
	return errors.Is(err, infectious.NotEnoughShares) ||
		errors.Is(err, infectious.TooManyErrors)
}
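
// readAllStripes is a hypothetical caller sketch, not part of the original
// file, showing the ReadStripes contract end to end: nextStripe must track
// the running count of returned stripes, the returned slice may be passed
// back in as out to reuse its backing array, and io.EOF signals a clean end
// of the segment.
func readAllStripes(ctx context.Context, readers map[int]io.ReadCloser, scheme ErasureScheme, totalStripes int) ([]byte, error) {
	sr := NewStripeReader(readers, scheme, totalStripes, true)
	defer func() { _ = sr.CloseAndWait() }()

	var segment []byte
	var buf []byte
	var nextStripe int64
	for {
		out, count, err := sr.ReadStripes(ctx, nextStripe, buf)
		if err != nil {
			if errors.Is(err, io.EOF) {
				return segment, nil
			}
			return nil, err
		}
		segment = append(segment, out...)
		nextStripe += int64(count)
		buf = out // ReadStripes overwrites out, so its buffer can be reused
	}
}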