github.com/celestiaorg/celestia-node@v0.15.0-beta.1/share/eds/retriever.go (about) 1 package eds 2 3 import ( 4 "context" 5 "errors" 6 "sync" 7 "sync/atomic" 8 "time" 9 10 "github.com/ipfs/boxo/blockservice" 11 "github.com/ipfs/go-cid" 12 logging "github.com/ipfs/go-log/v2" 13 "go.opentelemetry.io/otel" 14 "go.opentelemetry.io/otel/attribute" 15 "go.opentelemetry.io/otel/codes" 16 "go.opentelemetry.io/otel/trace" 17 18 "github.com/celestiaorg/celestia-app/pkg/da" 19 "github.com/celestiaorg/celestia-app/pkg/wrapper" 20 "github.com/celestiaorg/nmt" 21 "github.com/celestiaorg/rsmt2d" 22 23 "github.com/celestiaorg/celestia-node/share" 24 "github.com/celestiaorg/celestia-node/share/eds/byzantine" 25 "github.com/celestiaorg/celestia-node/share/ipld" 26 ) 27 28 var ( 29 log = logging.Logger("share/eds") 30 tracer = otel.Tracer("share/eds") 31 ) 32 33 // Retriever retrieves rsmt2d.ExtendedDataSquares from the IPLD network. 34 // Instead of requesting data 'share by share' it requests data by quadrants 35 // minimizing bandwidth usage in the happy cases. 36 // 37 // ---- ---- 38 // | 0 | 1 | 39 // ---- ---- 40 // | 2 | 3 | 41 // ---- ---- 42 // 43 // Retriever randomly picks one of the data square quadrants and tries to request them one by one 44 // until it is able to reconstruct the whole square. 45 type Retriever struct { 46 bServ blockservice.BlockService 47 } 48 49 // NewRetriever creates a new instance of the Retriever over IPLD BlockService and rmst2d.Codec 50 func NewRetriever(bServ blockservice.BlockService) *Retriever { 51 return &Retriever{bServ: bServ} 52 } 53 54 // Retrieve retrieves all the data committed to DataAvailabilityHeader. 55 // 56 // If not available locally, it aims to request from the network only one quadrant (1/4) of the 57 // data square and reconstructs the other three quadrants (3/4). If the requested quadrant is not 58 // available within RetrieveQuadrantTimeout, it starts requesting another quadrant until either the 59 // data is reconstructed, context is canceled or ErrByzantine is generated. 60 func (r *Retriever) Retrieve(ctx context.Context, dah *da.DataAvailabilityHeader) (*rsmt2d.ExtendedDataSquare, error) { 61 ctx, cancel := context.WithCancel(ctx) 62 defer cancel() // cancels all the ongoing requests if reconstruction succeeds early 63 64 ctx, span := tracer.Start(ctx, "retrieve-square") 65 defer span.End() 66 span.SetAttributes( 67 attribute.Int("size", len(dah.RowRoots)), 68 ) 69 70 log.Debugw("retrieving data square", "data_hash", dah.String(), "size", len(dah.RowRoots)) 71 ses, err := r.newSession(ctx, dah) 72 if err != nil { 73 return nil, err 74 } 75 defer ses.Close() 76 77 // wait for a signal to start reconstruction 78 // try until either success or context or bad data 79 for { 80 select { 81 case <-ses.Done(): 82 eds, err := ses.Reconstruct(ctx) 83 if err == nil { 84 span.SetStatus(codes.Ok, "square-retrieved") 85 return eds, nil 86 } 87 // check to ensure it is not a catastrophic ErrByzantine case, otherwise handle accordingly 88 var errByz *rsmt2d.ErrByzantineData 89 if errors.As(err, &errByz) { 90 span.RecordError(err) 91 return nil, byzantine.NewErrByzantine(ctx, r.bServ, dah, errByz) 92 } 93 94 log.Warnw("not enough shares to reconstruct data square, requesting more...", "err", err) 95 case <-ctx.Done(): 96 return nil, ctx.Err() 97 } 98 } 99 } 100 101 // retrievalSession represents a data square retrieval session. 102 // It manages one data square that is being retrieved and 103 // quadrant request retries. Also, provides an API 104 // to reconstruct the block once enough shares are fetched. 105 type retrievalSession struct { 106 dah *da.DataAvailabilityHeader 107 bget blockservice.BlockGetter 108 109 // TODO(@Wondertan): Extract into a separate data structure 110 // https://github.com/celestiaorg/rsmt2d/issues/135 111 squareQuadrants []*quadrant 112 squareCellsLks [][]sync.Mutex 113 squareCellsCount uint32 114 squareSig chan struct{} 115 squareDn chan struct{} 116 squareLk sync.RWMutex 117 square *rsmt2d.ExtendedDataSquare 118 119 span trace.Span 120 } 121 122 // newSession creates a new retrieval session and kicks off requesting process. 123 func (r *Retriever) newSession(ctx context.Context, dah *da.DataAvailabilityHeader) (*retrievalSession, error) { 124 size := len(dah.RowRoots) 125 126 treeFn := func(_ rsmt2d.Axis, index uint) rsmt2d.Tree { 127 // use proofs adder if provided, to cache collected proofs while recomputing the eds 128 var opts []nmt.Option 129 visitor := ipld.ProofsAdderFromCtx(ctx).VisitFn() 130 if visitor != nil { 131 opts = append(opts, nmt.NodeVisitor(visitor)) 132 } 133 134 tree := wrapper.NewErasuredNamespacedMerkleTree(uint64(size)/2, index, opts...) 135 return &tree 136 } 137 138 square, err := rsmt2d.NewExtendedDataSquare(share.DefaultRSMT2DCodec(), treeFn, uint(size), share.Size) 139 if err != nil { 140 return nil, err 141 } 142 143 ses := &retrievalSession{ 144 dah: dah, 145 bget: blockservice.NewSession(ctx, r.bServ), 146 squareQuadrants: newQuadrants(dah), 147 squareCellsLks: make([][]sync.Mutex, size), 148 squareSig: make(chan struct{}, 1), 149 squareDn: make(chan struct{}), 150 square: square, 151 span: trace.SpanFromContext(ctx), 152 } 153 for i := range ses.squareCellsLks { 154 ses.squareCellsLks[i] = make([]sync.Mutex, size) 155 } 156 157 go ses.request(ctx) 158 return ses, nil 159 } 160 161 // Done signals that enough shares have been retrieved to attempt 162 // square reconstruction. "Attempt" because there is no way currently to 163 // guarantee that reconstruction can be performed with the shares provided. 164 func (rs *retrievalSession) Done() <-chan struct{} { 165 return rs.squareSig 166 } 167 168 // Reconstruct tries to reconstruct the data square and returns it on success. 169 func (rs *retrievalSession) Reconstruct(ctx context.Context) (*rsmt2d.ExtendedDataSquare, error) { 170 if rs.isReconstructed() { 171 return rs.square, nil 172 } 173 // prevent further writes to the square 174 rs.squareLk.Lock() 175 defer rs.squareLk.Unlock() 176 177 _, span := tracer.Start(ctx, "reconstruct-square") 178 defer span.End() 179 180 // and try to repair with what we have 181 err := rs.square.Repair(rs.dah.RowRoots, rs.dah.ColumnRoots) 182 if err != nil { 183 span.RecordError(err) 184 return nil, err 185 } 186 log.Infow("data square reconstructed", "data_hash", rs.dah.String(), "size", len(rs.dah.RowRoots)) 187 close(rs.squareDn) 188 return rs.square, nil 189 } 190 191 // isReconstructed report true whether the square attached to the session 192 // is already reconstructed. 193 func (rs *retrievalSession) isReconstructed() bool { 194 select { 195 case <-rs.squareDn: 196 // return early if square is already reconstructed 197 return true 198 default: 199 return false 200 } 201 } 202 203 func (rs *retrievalSession) Close() error { 204 defer rs.span.End() 205 return nil 206 } 207 208 // request kicks off quadrants requests. 209 // It instantly requests a quadrant and periodically requests more 210 // until either context is canceled or we are out of quadrants. 211 func (rs *retrievalSession) request(ctx context.Context) { 212 t := time.NewTicker(RetrieveQuadrantTimeout) 213 defer t.Stop() 214 for retry := 0; retry < len(rs.squareQuadrants); retry++ { 215 q := rs.squareQuadrants[retry] 216 log.Debugw("requesting quadrant", 217 "axis", q.source, 218 "x", q.x, 219 "y", q.y, 220 "size", len(q.roots), 221 ) 222 rs.span.AddEvent("requesting quadrant", trace.WithAttributes( 223 attribute.Int("axis", int(q.source)), 224 attribute.Int("x", q.x), 225 attribute.Int("y", q.y), 226 attribute.Int("size", len(q.roots)), 227 )) 228 rs.doRequest(ctx, q) 229 select { 230 case <-t.C: 231 case <-ctx.Done(): 232 return 233 } 234 log.Warnw("quadrant request timeout", 235 "timeout", RetrieveQuadrantTimeout.String(), 236 "axis", q.source, 237 "x", q.x, 238 "y", q.y, 239 "size", len(q.roots), 240 ) 241 rs.span.AddEvent("quadrant request timeout", trace.WithAttributes( 242 attribute.Int("axis", int(q.source)), 243 attribute.Int("x", q.x), 244 attribute.Int("y", q.y), 245 attribute.Int("size", len(q.roots)), 246 )) 247 } 248 } 249 250 // doRequest requests the given quadrant by requesting halves of axis(Row or Col) using GetShares 251 // and fills shares into rs.square slice. 252 func (rs *retrievalSession) doRequest(ctx context.Context, q *quadrant) { 253 size := len(q.roots) 254 for i, root := range q.roots { 255 go func(i int, root cid.Cid) { 256 // get the root node 257 nd, err := ipld.GetNode(ctx, rs.bget, root) 258 if err != nil { 259 rs.span.RecordError(err, trace.WithAttributes( 260 attribute.Int("root-index", i), 261 )) 262 return 263 } 264 // and go get shares of left or the right side of the whole col/row axis 265 // the left or the right side of the tree represent some portion of the quadrant 266 // which we put into the rs.square share-by-share by calculating shares' indexes using q.index 267 ipld.GetShares(ctx, rs.bget, nd.Links()[q.x].Cid, size, func(j int, share share.Share) { 268 // NOTE: Each share can appear twice here, for a Row and Col, respectively. 269 // These shares are always equal, and we allow only the first one to be written 270 // in the square. 271 // NOTE-2: We may never actually fetch shares from the network *twice*. 272 // Once a share is downloaded from the network it may be cached on the IPLD(blockservice) level. 273 // 274 // calc position of the share 275 x, y := q.pos(i, j) 276 // try to lock the share 277 ok := rs.squareCellsLks[x][y].TryLock() 278 if !ok { 279 // if already locked and written - do nothing 280 return 281 } 282 // The R lock here is *not* to protect rs.square from multiple 283 // concurrent shares writes but to avoid races between share writes and 284 // repairing attempts. 285 // Shares are written atomically in their own slice slots and these "writes" do 286 // not need synchronization! 287 rs.squareLk.RLock() 288 defer rs.squareLk.RUnlock() 289 // the routine could be blocked above for some time during which the square 290 // might be reconstructed, if so don't write anything and return 291 if rs.isReconstructed() { 292 return 293 } 294 if err := rs.square.SetCell(uint(x), uint(y), share); err != nil { 295 // safe to ignore as: 296 // * share size already verified 297 // * the same share might come from either Row or Col 298 return 299 } 300 // if we have >= 1/4 of the square we can start trying to Reconstruct 301 // TODO(@Wondertan): This is not an ideal way to know when to start 302 // reconstruction and can cause idle reconstruction tries in some cases, 303 // but it is totally fine for the happy case and for now. 304 // The earlier we correctly know that we have the full square - the earlier 305 // we cancel ongoing requests - the less data is being wastedly transferred. 306 if atomic.AddUint32(&rs.squareCellsCount, 1) >= uint32(size*size) { 307 select { 308 case rs.squareSig <- struct{}{}: 309 default: 310 } 311 } 312 }) 313 }(i, root) 314 } 315 }