github.com/celestiaorg/celestia-node@v0.15.0-beta.1/share/eds/retriever.go (about)

     1  package eds
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"sync"
     7  	"sync/atomic"
     8  	"time"
     9  
    10  	"github.com/ipfs/boxo/blockservice"
    11  	"github.com/ipfs/go-cid"
    12  	logging "github.com/ipfs/go-log/v2"
    13  	"go.opentelemetry.io/otel"
    14  	"go.opentelemetry.io/otel/attribute"
    15  	"go.opentelemetry.io/otel/codes"
    16  	"go.opentelemetry.io/otel/trace"
    17  
    18  	"github.com/celestiaorg/celestia-app/pkg/da"
    19  	"github.com/celestiaorg/celestia-app/pkg/wrapper"
    20  	"github.com/celestiaorg/nmt"
    21  	"github.com/celestiaorg/rsmt2d"
    22  
    23  	"github.com/celestiaorg/celestia-node/share"
    24  	"github.com/celestiaorg/celestia-node/share/eds/byzantine"
    25  	"github.com/celestiaorg/celestia-node/share/ipld"
    26  )
    27  
    28  var (
    29  	log    = logging.Logger("share/eds")
    30  	tracer = otel.Tracer("share/eds")
    31  )
    32  
    33  // Retriever retrieves rsmt2d.ExtendedDataSquares from the IPLD network.
    34  // Instead of requesting data 'share by share' it requests data by quadrants
    35  // minimizing bandwidth usage in the happy cases.
    36  //
    37  //	 ---- ----
    38  //	| 0  | 1  |
    39  //	 ---- ----
    40  //	| 2  | 3  |
    41  //	 ---- ----
    42  //
    43  // Retriever randomly picks one of the data square quadrants and tries to request them one by one
    44  // until it is able to reconstruct the whole square.
    45  type Retriever struct {
    46  	bServ blockservice.BlockService
    47  }
    48  
    49  // NewRetriever creates a new instance of the Retriever over IPLD BlockService and rmst2d.Codec
    50  func NewRetriever(bServ blockservice.BlockService) *Retriever {
    51  	return &Retriever{bServ: bServ}
    52  }
    53  
    54  // Retrieve retrieves all the data committed to DataAvailabilityHeader.
    55  //
    56  // If not available locally, it aims to request from the network only one quadrant (1/4) of the
    57  // data square and reconstructs the other three quadrants (3/4). If the requested quadrant is not
    58  // available within RetrieveQuadrantTimeout, it starts requesting another quadrant until either the
    59  // data is reconstructed, context is canceled or ErrByzantine is generated.
    60  func (r *Retriever) Retrieve(ctx context.Context, dah *da.DataAvailabilityHeader) (*rsmt2d.ExtendedDataSquare, error) {
    61  	ctx, cancel := context.WithCancel(ctx)
    62  	defer cancel() // cancels all the ongoing requests if reconstruction succeeds early
    63  
    64  	ctx, span := tracer.Start(ctx, "retrieve-square")
    65  	defer span.End()
    66  	span.SetAttributes(
    67  		attribute.Int("size", len(dah.RowRoots)),
    68  	)
    69  
    70  	log.Debugw("retrieving data square", "data_hash", dah.String(), "size", len(dah.RowRoots))
    71  	ses, err := r.newSession(ctx, dah)
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  	defer ses.Close()
    76  
    77  	// wait for a signal to start reconstruction
    78  	// try until either success or context or bad data
    79  	for {
    80  		select {
    81  		case <-ses.Done():
    82  			eds, err := ses.Reconstruct(ctx)
    83  			if err == nil {
    84  				span.SetStatus(codes.Ok, "square-retrieved")
    85  				return eds, nil
    86  			}
    87  			// check to ensure it is not a catastrophic ErrByzantine case, otherwise handle accordingly
    88  			var errByz *rsmt2d.ErrByzantineData
    89  			if errors.As(err, &errByz) {
    90  				span.RecordError(err)
    91  				return nil, byzantine.NewErrByzantine(ctx, r.bServ, dah, errByz)
    92  			}
    93  
    94  			log.Warnw("not enough shares to reconstruct data square, requesting more...", "err", err)
    95  		case <-ctx.Done():
    96  			return nil, ctx.Err()
    97  		}
    98  	}
    99  }
   100  
   101  // retrievalSession represents a data square retrieval session.
   102  // It manages one data square that is being retrieved and
   103  // quadrant request retries. Also, provides an API
   104  // to reconstruct the block once enough shares are fetched.
   105  type retrievalSession struct {
   106  	dah  *da.DataAvailabilityHeader
   107  	bget blockservice.BlockGetter
   108  
   109  	// TODO(@Wondertan): Extract into a separate data structure
   110  	// https://github.com/celestiaorg/rsmt2d/issues/135
   111  	squareQuadrants  []*quadrant
   112  	squareCellsLks   [][]sync.Mutex
   113  	squareCellsCount uint32
   114  	squareSig        chan struct{}
   115  	squareDn         chan struct{}
   116  	squareLk         sync.RWMutex
   117  	square           *rsmt2d.ExtendedDataSquare
   118  
   119  	span trace.Span
   120  }
   121  
   122  // newSession creates a new retrieval session and kicks off requesting process.
   123  func (r *Retriever) newSession(ctx context.Context, dah *da.DataAvailabilityHeader) (*retrievalSession, error) {
   124  	size := len(dah.RowRoots)
   125  
   126  	treeFn := func(_ rsmt2d.Axis, index uint) rsmt2d.Tree {
   127  		// use proofs adder if provided, to cache collected proofs while recomputing the eds
   128  		var opts []nmt.Option
   129  		visitor := ipld.ProofsAdderFromCtx(ctx).VisitFn()
   130  		if visitor != nil {
   131  			opts = append(opts, nmt.NodeVisitor(visitor))
   132  		}
   133  
   134  		tree := wrapper.NewErasuredNamespacedMerkleTree(uint64(size)/2, index, opts...)
   135  		return &tree
   136  	}
   137  
   138  	square, err := rsmt2d.NewExtendedDataSquare(share.DefaultRSMT2DCodec(), treeFn, uint(size), share.Size)
   139  	if err != nil {
   140  		return nil, err
   141  	}
   142  
   143  	ses := &retrievalSession{
   144  		dah:             dah,
   145  		bget:            blockservice.NewSession(ctx, r.bServ),
   146  		squareQuadrants: newQuadrants(dah),
   147  		squareCellsLks:  make([][]sync.Mutex, size),
   148  		squareSig:       make(chan struct{}, 1),
   149  		squareDn:        make(chan struct{}),
   150  		square:          square,
   151  		span:            trace.SpanFromContext(ctx),
   152  	}
   153  	for i := range ses.squareCellsLks {
   154  		ses.squareCellsLks[i] = make([]sync.Mutex, size)
   155  	}
   156  
   157  	go ses.request(ctx)
   158  	return ses, nil
   159  }
   160  
   161  // Done signals that enough shares have been retrieved to attempt
   162  // square reconstruction. "Attempt" because there is no way currently to
   163  // guarantee that reconstruction can be performed with the shares provided.
   164  func (rs *retrievalSession) Done() <-chan struct{} {
   165  	return rs.squareSig
   166  }
   167  
   168  // Reconstruct tries to reconstruct the data square and returns it on success.
   169  func (rs *retrievalSession) Reconstruct(ctx context.Context) (*rsmt2d.ExtendedDataSquare, error) {
   170  	if rs.isReconstructed() {
   171  		return rs.square, nil
   172  	}
   173  	// prevent further writes to the square
   174  	rs.squareLk.Lock()
   175  	defer rs.squareLk.Unlock()
   176  
   177  	_, span := tracer.Start(ctx, "reconstruct-square")
   178  	defer span.End()
   179  
   180  	// and try to repair with what we have
   181  	err := rs.square.Repair(rs.dah.RowRoots, rs.dah.ColumnRoots)
   182  	if err != nil {
   183  		span.RecordError(err)
   184  		return nil, err
   185  	}
   186  	log.Infow("data square reconstructed", "data_hash", rs.dah.String(), "size", len(rs.dah.RowRoots))
   187  	close(rs.squareDn)
   188  	return rs.square, nil
   189  }
   190  
   191  // isReconstructed report true whether the square attached to the session
   192  // is already reconstructed.
   193  func (rs *retrievalSession) isReconstructed() bool {
   194  	select {
   195  	case <-rs.squareDn:
   196  		// return early if square is already reconstructed
   197  		return true
   198  	default:
   199  		return false
   200  	}
   201  }
   202  
   203  func (rs *retrievalSession) Close() error {
   204  	defer rs.span.End()
   205  	return nil
   206  }
   207  
   208  // request kicks off quadrants requests.
   209  // It instantly requests a quadrant and periodically requests more
   210  // until either context is canceled or we are out of quadrants.
   211  func (rs *retrievalSession) request(ctx context.Context) {
   212  	t := time.NewTicker(RetrieveQuadrantTimeout)
   213  	defer t.Stop()
   214  	for retry := 0; retry < len(rs.squareQuadrants); retry++ {
   215  		q := rs.squareQuadrants[retry]
   216  		log.Debugw("requesting quadrant",
   217  			"axis", q.source,
   218  			"x", q.x,
   219  			"y", q.y,
   220  			"size", len(q.roots),
   221  		)
   222  		rs.span.AddEvent("requesting quadrant", trace.WithAttributes(
   223  			attribute.Int("axis", int(q.source)),
   224  			attribute.Int("x", q.x),
   225  			attribute.Int("y", q.y),
   226  			attribute.Int("size", len(q.roots)),
   227  		))
   228  		rs.doRequest(ctx, q)
   229  		select {
   230  		case <-t.C:
   231  		case <-ctx.Done():
   232  			return
   233  		}
   234  		log.Warnw("quadrant request timeout",
   235  			"timeout", RetrieveQuadrantTimeout.String(),
   236  			"axis", q.source,
   237  			"x", q.x,
   238  			"y", q.y,
   239  			"size", len(q.roots),
   240  		)
   241  		rs.span.AddEvent("quadrant request timeout", trace.WithAttributes(
   242  			attribute.Int("axis", int(q.source)),
   243  			attribute.Int("x", q.x),
   244  			attribute.Int("y", q.y),
   245  			attribute.Int("size", len(q.roots)),
   246  		))
   247  	}
   248  }
   249  
   250  // doRequest requests the given quadrant by requesting halves of axis(Row or Col) using GetShares
   251  // and fills shares into rs.square slice.
   252  func (rs *retrievalSession) doRequest(ctx context.Context, q *quadrant) {
   253  	size := len(q.roots)
   254  	for i, root := range q.roots {
   255  		go func(i int, root cid.Cid) {
   256  			// get the root node
   257  			nd, err := ipld.GetNode(ctx, rs.bget, root)
   258  			if err != nil {
   259  				rs.span.RecordError(err, trace.WithAttributes(
   260  					attribute.Int("root-index", i),
   261  				))
   262  				return
   263  			}
   264  			// and go get shares of left or the right side of the whole col/row axis
   265  			// the left or the right side of the tree represent some portion of the quadrant
   266  			// which we put into the rs.square share-by-share by calculating shares' indexes using q.index
   267  			ipld.GetShares(ctx, rs.bget, nd.Links()[q.x].Cid, size, func(j int, share share.Share) {
   268  				// NOTE: Each share can appear twice here, for a Row and Col, respectively.
   269  				// These shares are always equal, and we allow only the first one to be written
   270  				// in the square.
   271  				// NOTE-2: We may never actually fetch shares from the network *twice*.
   272  				// Once a share is downloaded from the network it may be cached on the IPLD(blockservice) level.
   273  				//
   274  				// calc position of the share
   275  				x, y := q.pos(i, j)
   276  				// try to lock the share
   277  				ok := rs.squareCellsLks[x][y].TryLock()
   278  				if !ok {
   279  					// if already locked and written - do nothing
   280  					return
   281  				}
   282  				// The R lock here is *not* to protect rs.square from multiple
   283  				// concurrent shares writes but to avoid races between share writes and
   284  				// repairing attempts.
   285  				// Shares are written atomically in their own slice slots and these "writes" do
   286  				// not need synchronization!
   287  				rs.squareLk.RLock()
   288  				defer rs.squareLk.RUnlock()
   289  				// the routine could be blocked above for some time during which the square
   290  				// might be reconstructed, if so don't write anything and return
   291  				if rs.isReconstructed() {
   292  					return
   293  				}
   294  				if err := rs.square.SetCell(uint(x), uint(y), share); err != nil {
   295  					// safe to ignore as:
   296  					// * share size already verified
   297  					// * the same share might come from either Row or Col
   298  					return
   299  				}
   300  				// if we have >= 1/4 of the square we can start trying to Reconstruct
   301  				// TODO(@Wondertan): This is not an ideal way to know when to start
   302  				//  reconstruction and can cause idle reconstruction tries in some cases,
   303  				//  but it is totally fine for the happy case and for now.
   304  				//  The earlier we correctly know that we have the full square - the earlier
   305  				//  we cancel ongoing requests - the less data is being wastedly transferred.
   306  				if atomic.AddUint32(&rs.squareCellsCount, 1) >= uint32(size*size) {
   307  					select {
   308  					case rs.squareSig <- struct{}{}:
   309  					default:
   310  					}
   311  				}
   312  			})
   313  		}(i, root)
   314  	}
   315  }