storj.io/uplink@v1.13.0/private/eestream/stripe.go

// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.

package eestream

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/spacemonkeygo/monkit/v3"
	"golang.org/x/exp/slices"

	"storj.io/common/rpc/rpctracing"
	"storj.io/common/sync2"
	"storj.io/infectious"
)

const (
	debugEnabled             = false
	maxStripesAhead          = 256 // might be interesting to test different values later
	quiescentCheckInterval   = time.Second
	quiescentIntervalTrigger = 5 // number of quiescent check intervals before triggering
)
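
// With quiescentCheckInterval at one second and quiescentIntervalTrigger at 5,
// a download is declared quiescent after roughly five consecutive seconds in
// which no piece makes any progress (see the monitor goroutine in start).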

// pieceReader represents the stream of shares within one piece.
type pieceReader struct {
	shareNum     int
	source       io.Reader
	sourceCloser io.Closer
	buffer       *StreamingPiece

	backpressureMu  sync.Mutex
	backpressure    sync.Cond
	completedShares int
}

// StripeReader reads from a collection of piece io.ReadClosers in parallel,
// recombining them into a single stream using an ErasureScheme.
type StripeReader struct {
	bundy           *PiecesProgress
	pieces          []pieceReader
	scheme          ErasureScheme
	wg              sync.WaitGroup
	stripeReady     sync2.Event
	returnedStripes int32
	totalStripes    int32
	errorDetection  bool
	runningPieces   atomic.Int32
	quiescent       atomic.Bool
}
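
// The bundy field (a *PiecesProgress) works like a punch clock: each share
// reader records how many shares it has received via SharesCompleted, and the
// core sleeps until enough pieces have punched in past the watermark set with
// SetStripesNeeded.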

// NewStripeReader makes a new StripeReader using the provided map of share
// number to io.ReadClosers, an ErasureScheme, the total number of stripes in
// the stream, and whether to use the ErasureScheme's error detection.
func NewStripeReader(readers map[int]io.ReadCloser, scheme ErasureScheme, totalStripes int,
	errorDetection bool) *StripeReader {

	pool := NewBatchPool(scheme.ErasureShareSize())

	totalPieceSize := int64(totalStripes) * int64(scheme.ErasureShareSize())

	pieces := make([]pieceReader, 0, len(readers))
	for shareNum, source := range readers {
		pieces = append(pieces, pieceReader{
			shareNum:     shareNum,
			source:       io.LimitReader(source, totalPieceSize),
			sourceCloser: source,
			buffer:       NewStreamingPiece(scheme.ErasureShareSize(), totalPieceSize, pool),
		})
		piece := &pieces[len(pieces)-1]
		piece.backpressure.L = &piece.backpressureMu
	}

	minimum := int32(scheme.RequiredCount())
	if errorDetection && minimum < int32(len(pieces)) {
		minimum++
	}

	s := &StripeReader{
		bundy:          NewPiecesProgress(minimum, int32(len(pieces))),
		pieces:         pieces,
		scheme:         scheme,
		totalStripes:   int32(totalStripes),
		errorDetection: errorDetection,
	}
	s.start()
	return s
}
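
// A minimal construction sketch (readers, scheme, and totalStripes are assumed
// to be supplied by the caller; illustrative only, not a prescribed pattern):
//
//	sr := NewStripeReader(readers, scheme, totalStripes, true)
//	defer func() { _ = sr.CloseAndWait() }()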

// start creates the goroutines to start reading each of the share streams.
func (s *StripeReader) start() {
	if debugEnabled {
		fmt.Println("starting", len(s.pieces), "readers")
	}

	var pwg sync.WaitGroup
	s.runningPieces.Store(int32(len(s.pieces)))

	for idx := range s.pieces {
		s.wg.Add(1)
		pwg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			defer pwg.Done()

			// whenever a share reader is done, we should wake up the core in case
			// this share reader just exited unsuccessfully and this represents a
			// failure to get enough pieces.
			defer s.stripeReady.Signal()

			// we should mark that there is one less running share reader.
			defer s.runningPieces.Add(-1)

			// do the work.
			s.readShares(idx)
		}(idx)
	}

	done := make(chan struct{})
	go func() {
		pwg.Wait()
		close(done)
	}()

	s.wg.Add(1)
	go func() {
		defer s.wg.Done()

		s1 := s.bundy.ProgressSnapshot(nil)
		var s2 []int32

		t := time.NewTicker(quiescentCheckInterval)
		defer t.Stop()

		match := 0
		for {
			select {
			case <-t.C:
				s2 = s.bundy.ProgressSnapshot(s2[:0])

				if !slices.Equal(s1, s2) {
					match = 0
					s2, s1 = s1, s2
					continue
				}

				match++
				if match == quiescentIntervalTrigger {
					s.quiescent.Store(true)
					s.stripeReady.Signal()
					return
				}

			case <-done:
				return
			}
		}
	}()
}

// readShares is the method that does the actual work of reading an individual
// share stream.
func (s *StripeReader) readShares(idx int) {
	r := &s.pieces[idx]
	stripesSoFar := 0
	for {
		// see if we can fill this index's buffer with data from r.source.
		shares, done := r.buffer.ReadSharesFrom(r.source)

		// did we get any shares?
		if shares > 0 {
			// yay!
			stripesSoFar += shares
			if debugEnabled {
				fmt.Println(idx, "read", shares, "shares")
			}
			// tell the bundy clock
			if s.bundy.SharesCompleted(idx, int32(shares)) {
				// oh hey, bundy says we just changed the situation and we should wake
				// up the core.
				if debugEnabled {
					fmt.Println(idx, "bundy counter says", shares, "is ready")
				}
				s.stripeReady.Signal()
			}
		} else if debugEnabled {
			fmt.Println(idx, "read 0 shares?")
		}

		// will we get any more shares?
		if done {
			if debugEnabled {
				fmt.Println(idx, "done")
			}
			break
		}

		r.backpressure.L.Lock()
		// how far ahead are we? are we too far ahead of the core? if so, let's
		// wait. the core will mark us completed if things are closing.
		for stripesSoFar > r.completedShares+maxStripesAhead &&
			r.completedShares < int(s.totalStripes) {
			r.backpressure.Wait()
		}
		r.backpressure.L.Unlock()
	}
}

// markCompleted updates the pieceReader's accounting of how far ahead it
// is from the core, and also tells the *StreamingPiece so that it can free up
// some internal buffers.
func (r *pieceReader) markCompleted(stripes int) {
	r.backpressure.L.Lock()
	defer r.backpressure.L.Unlock()
	r.buffer.MarkCompleted(stripes)
	if stripes > r.completedShares {
		r.completedShares = stripes
	}
	// the pieceReader might be asleep. let's wake it up.
	r.backpressure.Signal()
}
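
// Together, readShares and markCompleted form the backpressure protocol: a
// share reader sleeps on its condition variable once it gets more than
// maxStripesAhead stripes past what the core has consumed, and markCompleted
// (called from ReadStripes, Close, and CloseAndWait) raises the watermark and
// signals the reader awake again.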

// Close does *not* close the readers it received in the constructor.
// Close does *not* wait for reader goroutines to shut down. See CloseAndWait
// if you want other behavior. Close mimics the older eestream.StripeReader
// behavior.
func (s *StripeReader) Close() error {
	for idx := range s.pieces {
		s.wg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			r := &s.pieces[idx]
			r.markCompleted(int(s.totalStripes))
		}(idx)
	}
	return nil
}

// CloseAndWait closes all readers and waits for all goroutines.
func (s *StripeReader) CloseAndWait() error {
	for idx := range s.pieces {
		s.wg.Add(1)
		go func(idx int) {
			defer s.wg.Done()
			r := &s.pieces[idx]
			_ = r.sourceCloser.Close()
			r.markCompleted(int(s.totalStripes))
		}(idx)
	}
	s.wg.Wait()
	return nil
}

func (s *StripeReader) combineErrs() error {
	var errstrings []string
	for idx := range s.pieces {
		if err := s.pieces[idx].buffer.Err(); err != nil && !errors.Is(err, io.EOF) {
			errstrings = append(errstrings, fmt.Sprintf("\nerror retrieving piece %02d: %v", s.pieces[idx].shareNum, err))
		}
	}
	if len(errstrings) > 0 {
		sort.Strings(errstrings)
		return Error.New("failed to download segment: %s", strings.Join(errstrings, ""))
	}
	return Error.New("programmer error: no errors to combine")
}

var backcompatMon = monkit.ScopeNamed("storj.io/storj/uplink/eestream")
var monReadStripeTask = mon.Task()

// ReadStripes returns 1 or more stripes. out is overwritten.
func (s *StripeReader) ReadStripes(ctx context.Context, nextStripe int64, out []byte) (_ []byte, count int, err error) {
	defer monReadStripeTask(&ctx)(&err)
	ctx = rpctracing.WithoutDistributedTracing(ctx)

	if nextStripe != int64(s.returnedStripes) {
		return nil, 0, Error.New("unexpected next stripe")
	}

	// first, some memory management. do we have a place to write the results,
	// and how many stripes can we write?
	if cap(out) <= 0 {
		out = make([]byte, 0, globalBufSize)
	}
	maxStripes := int32(cap(out) / s.scheme.StripeSize())
	if debugEnabled {
		fmt.Println("core initial stripe calc", maxStripes, s.returnedStripes, s.totalStripes)
	}
	if s.returnedStripes+maxStripes > s.totalStripes {
		maxStripes = s.totalStripes - s.returnedStripes
	}
	if maxStripes <= 0 {
		return nil, 0, io.EOF
	}

	if debugEnabled {
		fmt.Println("core downloading", maxStripes, "at stripe size", s.scheme.StripeSize(), "with cap", cap(out))
	}

	// okay, let's tell the bundy clock we just want one new stripe. hopefully
	// we get more than just 1.
	requiredWatermark := s.returnedStripes + 1
	s.bundy.SetStripesNeeded(requiredWatermark)

	// if the bundy clock wakes up, we're going to find the lowest watermark
	// with the neededShares number of shares per stripe. since we're essentially doing
	// a min operation, let's start stripesFound at the highest value we want it,
	// and we will lower it as we inspect the pieceSharesReceived on the bundy clock.
	stripesFound := s.returnedStripes + maxStripes

	ready := make([]int, 0, len(s.pieces))

	for {
		// check if we were woken from quiescence. if so, error out.
		if s.quiescent.Load() {
			return nil, 0, QuiescentError.New("")
		}

		// okay let's tell the bundy clock we're awake and it should be okay to
		// wake us up again next time we sleep.
		s.bundy.AcknowledgeNewStripes()

		// let's also load the number of running pieces first before we go evaluate
		// their work to avoid a race.
		runningPieces := s.runningPieces.Load()

		// see how many are ready
		ready = ready[:0]
		for idx := range s.pieces {
			watermark := s.bundy.PieceSharesReceived(idx)
			if watermark >= requiredWatermark {
				ready = append(ready, idx)
				if watermark < stripesFound {
					// keep stripesFound at the smallest watermark
					stripesFound = watermark
				}
			}
		}
		if debugEnabled {
			fmt.Println("core found", len(ready), "ready")
		}

		// how many were ready? if we cleared the current neededShares, we can break
		// out of our condition variable for loop
		if int32(len(ready)) >= s.bundy.NeededShares() {
			if debugEnabled {
				fmt.Println("core bundy says that's enough. hooray")
			}
			// hooray!
			break
		}

		// not enough ready.
		// okay, were there enough running share readers at the start still so that
		// we could potentially still have enough ready in the future?
		if runningPieces+int32(len(ready)) < s.bundy.NeededShares() {
			// nope. we need to give up.
			backcompatMon.Meter("download_stripe_failed_not_enough_pieces_uplink").Mark(1) //mon:locked
			return nil, 0, s.combineErrs()
		}

		if debugEnabled {
			fmt.Println("core", len(ready), "ready not enough for", s.bundy.NeededShares(), ", sleeping")
		}

		// let's wait for the bundy clock to tell a share reader to wake us up.
		if !s.stripeReady.Wait(ctx) {
			return nil, 0, ctx.Err()
		}
	}

	// okay, we have enough share readers ready.

	// some pre-allocated working memory for erasure share calls.
	fecShares := make([]infectious.Share, 0, len(ready))

	// we're going to loop through the stripesFound - s.returnedStripes new
	// stripes we have available.
	for stripe := int(s.returnedStripes); stripe < int(stripesFound); stripe++ {
		stripeOffset := (stripe - int(s.returnedStripes)) * s.scheme.StripeSize()
		if debugEnabled {
			fmt.Println("core piecing together stripe", stripe, "and writing at offset", stripeOffset)
		}

		outslice := out[stripeOffset : stripeOffset+s.scheme.StripeSize()]

		fecShares = fecShares[:0]
		var releases []func()

		for _, idx := range ready {
			data, release, err := s.pieces[idx].buffer.ReadShare(stripe)
			if err != nil {
				return nil, 0, Error.New("unexpected error: %w", err)
			}
			releases = append(releases, release)
			fecShares = append(fecShares, infectious.Share{
				Number: s.pieces[idx].shareNum,
				Data:   data})
		}

		if s.errorDetection {
			_, err = s.scheme.Decode(outslice, fecShares)
		} else {
			err = s.scheme.Rebuild(fecShares, func(r infectious.Share) {
				copy(outslice[r.Number*len(r.Data):(r.Number+1)*len(r.Data)], r.Data)
			})
		}

		for _, release := range releases {
			release()
		}

		if err != nil {
			if needsMoreShares(err) {
				if s.bundy.IncreaseNeededShares() {
					// just start over now
					return s.ReadStripes(ctx, nextStripe, out)
				}
			}
			return nil, 0, Error.New("error decoding data: %w", err)
		}
	}

	// okay, we're about to say we got a bunch of shares, so let's tell all the
	// share readers to raise their watermark of what's done.
	for idx := range s.pieces {
		s.pieces[idx].markCompleted(int(stripesFound))
	}

	stripes := stripesFound - s.returnedStripes
	s.returnedStripes = stripesFound

	if debugEnabled {
		fmt.Println("core returned", int(stripes)*s.scheme.StripeSize(), "bytes and", stripes, "stripes")
	}

	return out[:int(stripes)*s.scheme.StripeSize()], int(stripes), nil
}
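
// A caller-side sketch of the full read loop, assuming sr came from
// NewStripeReader above; ctx, the buf sizing, and the process helper are
// hypothetical:
//
//	var next int64
//	buf := make([]byte, 0, 1024*1024)
//	for {
//		out, n, err := sr.ReadStripes(ctx, next, buf)
//		if errors.Is(err, io.EOF) {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		process(out) // hypothetical consumer of the decoded stripe bytes
//		next += int64(n)
//	}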

func needsMoreShares(err error) bool {
	return errors.Is(err, infectious.NotEnoughShares) ||
		errors.Is(err, infectious.TooManyErrors)
}