github.com/ethersphere/bee/v2@v2.2.0/pkg/retrieval/retrieval.go

// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package retrieval provides the retrieval protocol
// implementation. The protocol is used to retrieve
// chunks over the network using forwarding-kademlia
// routing.
package retrieval

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/ethersphere/bee/v2/pkg/accounting"
	"github.com/ethersphere/bee/v2/pkg/cac"
	"github.com/ethersphere/bee/v2/pkg/log"
	"github.com/ethersphere/bee/v2/pkg/p2p"
	"github.com/ethersphere/bee/v2/pkg/p2p/protobuf"
	"github.com/ethersphere/bee/v2/pkg/pricer"
	pb "github.com/ethersphere/bee/v2/pkg/retrieval/pb"
	"github.com/ethersphere/bee/v2/pkg/skippeers"
	"github.com/ethersphere/bee/v2/pkg/soc"
	storage "github.com/ethersphere/bee/v2/pkg/storage"
	"github.com/ethersphere/bee/v2/pkg/swarm"
	"github.com/ethersphere/bee/v2/pkg/topology"
	"github.com/ethersphere/bee/v2/pkg/tracing"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	olog "github.com/opentracing/opentracing-go/log"
	"resenje.org/singleflight"
)

// loggerName is the tree path name of the logger for this package.
const loggerName = "retrieval"

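// Protocol identity. Bee's libp2p layer is expected to compose these values
// into the full stream name, i.e. something of the form
// /swarm/<protocol>/<version>/<stream>, here /swarm/retrieval/1.4.0/retrieval.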
const (
	protocolName    = "retrieval"
	protocolVersion = "1.4.0"
	streamName      = "retrieval"
)

var _ Interface = (*Service)(nil)

type Interface interface {
	// RetrieveChunk retrieves a chunk from the network using the retrieval protocol.
	// It takes a context, the address of the chunk to retrieve (content-addressed or
	// single-owner), and a source peer address for the case where the chunk is being
	// requested on behalf of another peer. If the request originates at the current
	// node (i.e. no forwarding is involved), the caller should pass swarm.ZeroAddress
	// as sourcePeerAddr.
	RetrieveChunk(ctx context.Context, address, sourcePeerAddr swarm.Address) (chunk swarm.Chunk, err error)
}
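
// A minimal usage sketch (svc and chunkAddr are hypothetical; a request that
// originates at this node passes swarm.ZeroAddress so no forwarding applies):
//
//	ctx, cancel := context.WithTimeout(context.Background(), RetrieveChunkTimeout)
//	defer cancel()
//	chunk, err := svc.RetrieveChunk(ctx, chunkAddr, swarm.ZeroAddress)
//	if err != nil {
//		// storage.ErrNotFound signals an exhausted retry budget.
//	}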

type retrievalResult struct {
	chunk swarm.Chunk
	peer  swarm.Address
	err   error
}

type Storer interface {
	Cache() storage.Putter
	Lookup() storage.Getter
}

type Service struct {
	addr          swarm.Address
	radiusFunc    func() (uint8, error)
	streamer      p2p.Streamer
	peerSuggester topology.ClosestPeerer
	storer        Storer
	singleflight  singleflight.Group[string, swarm.Chunk]
	logger        log.Logger
	accounting    accounting.Interface
	metrics       metrics
	pricer        pricer.Interface
	tracer        *tracing.Tracer
	caching       bool
	errSkip       *skippeers.List
}

func New(
	addr swarm.Address,
	radiusFunc func() (uint8, error),
	storer Storer,
	streamer p2p.Streamer,
	chunkPeerer topology.ClosestPeerer,
	logger log.Logger,
	accounting accounting.Interface,
	pricer pricer.Interface,
	tracer *tracing.Tracer,
	forwarderCaching bool,
) *Service {
	return &Service{
		addr:          addr,
		radiusFunc:    radiusFunc,
		streamer:      streamer,
		peerSuggester: chunkPeerer,
		storer:        storer,
		logger:        logger.WithName(loggerName).Register(),
		accounting:    accounting,
		pricer:        pricer,
		metrics:       newMetrics(),
		tracer:        tracer,
		caching:       forwarderCaching,
		errSkip:       skippeers.NewList(),
	}
}
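
// A wiring sketch (every dependency below is a hypothetical stand-in; in the
// node these are supplied by the kademlia, storage, accounting and pricing
// subsystems):
//
//	svc := New(
//		nodeAddr,   // this node's overlay address
//		radiusFunc, // reports the current storage radius
//		store,      // Storer: cache Putter plus lookup Getter
//		streamer,   // p2p.Streamer used to open retrieval streams
//		kad,        // topology.ClosestPeerer
//		logger, acct, pricer, tracer,
//		true, // enable forwarder caching
//	)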

func (s *Service) Protocol() p2p.ProtocolSpec {
	return p2p.ProtocolSpec{
		Name:    protocolName,
		Version: protocolVersion,
		StreamSpecs: []p2p.StreamSpec{
			{
				Name:    streamName,
				Handler: s.handler,
			},
		},
	}
}
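
// The returned spec is registered with the node's p2p service so that incoming
// retrieval streams are dispatched to s.handler; a sketch, assuming a
// p2p.Service value named p2pService:
//
//	if err := p2pService.AddProtocol(svc.Protocol()); err != nil {
//		// handle the registration error
//	}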

const (
	RetrieveChunkTimeout = time.Second * 30
	preemptiveInterval   = time.Second
	overDraftRefresh     = time.Millisecond * 600
	skiplistDur          = time.Minute
	originSuffix         = "_origin"
	maxOriginErrors      = 32
	maxMultiplexForwards = 2
)

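// RetrieveChunk retrieves the chunk with the given address from the network,
// as described on Interface. Concurrent calls for the same chunk are
// deduplicated via singleflight; origin requests use a separate "_origin"
// key so they are not coalesced with forwarded requests for the same chunk.
// A forwarded request is allowed a single failed attempt, while an origin
// request tolerates up to maxOriginErrors failures and fires preemptive
// retries every preemptiveInterval; when a selected peer falls within the
// storage radius, the request is additionally multiplexed to up to
// maxMultiplexForwards peers, each extending the error budget by one.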
func (s *Service) RetrieveChunk(ctx context.Context, chunkAddr, sourcePeerAddr swarm.Address) (swarm.Chunk, error) {
	loggerV1 := s.logger

	s.metrics.RequestCounter.Inc()

	origin := sourcePeerAddr.IsZero()

	if chunkAddr.IsZero() || chunkAddr.IsEmpty() || !chunkAddr.IsValidLength() {
		return nil, fmt.Errorf("invalid address queried")
	}

	flightRoute := chunkAddr.String()
	if origin {
		flightRoute = chunkAddr.String() + originSuffix
	}

	totalRetrieveAttempts := 0
	requestStartTime := time.Now()
	defer func() {
		s.metrics.RequestDurationTime.Observe(time.Since(requestStartTime).Seconds())
		s.metrics.RequestAttempts.Observe(float64(totalRetrieveAttempts))
	}()

	spanCtx := context.WithoutCancel(ctx)

	v, _, err := s.singleflight.Do(ctx, flightRoute, func(ctx context.Context) (swarm.Chunk, error) {

		skip := skippeers.NewList()
		defer skip.Close()

		var preemptiveTicker <-chan time.Time

		if !sourcePeerAddr.IsZero() {
			skip.Forever(chunkAddr, sourcePeerAddr)
		}

		quit := make(chan struct{})
		defer close(quit)

		var forwards = maxMultiplexForwards

		// if we are the origin node, allow many preemptive retries to speed up the retrieval of the chunk.
		errorsLeft := 1
		if origin {
			ticker := time.NewTicker(preemptiveInterval)
			defer ticker.Stop()
			preemptiveTicker = ticker.C
			errorsLeft = maxOriginErrors
		}

		resultC := make(chan retrievalResult, 1)
		retryC := make(chan struct{}, forwards+1)

		// retry schedules another attempt without ever blocking: the channel
		// is buffered and the default case drops the signal when it is full.
		retry := func() {
			select {
			case retryC <- struct{}{}:
			case <-ctx.Done():
			default:
			}
		}

		retry()

		inflight := 0

		for errorsLeft > 0 {

			select {
			case <-ctx.Done():
				return nil, ctx.Err()
			case <-preemptiveTicker:
				retry()
			case <-retryC:

				totalRetrieveAttempts++
				s.metrics.PeerRequestCounter.Inc()

				fullSkip := append(skip.ChunkPeers(chunkAddr), s.errSkip.ChunkPeers(chunkAddr)...)
				peer, err := s.closestPeer(chunkAddr, fullSkip, origin)

				if errors.Is(err, topology.ErrNotFound) {
					if skip.PruneExpiresAfter(chunkAddr, overDraftRefresh) == 0 { // no overdraft peers; we have depleted ALL peers
						if inflight == 0 {
							loggerV1.Debug("no peers left", "chunk_address", chunkAddr, "errors_left", errorsLeft, "isOrigin", origin, "own_proximity", swarm.Proximity(s.addr.Bytes(), chunkAddr.Bytes()), "error", err)
							return nil, err
						}
						continue // there is still an inflight request, wait for its result
					}

					loggerV1.Debug("sleeping to refresh overdraft balance", "chunk_address", chunkAddr)

					select {
					case <-time.After(overDraftRefresh):
						retry()
						continue
					case <-ctx.Done():
						return nil, ctx.Err()
					}
				}

				if err != nil {
					if inflight == 0 {
						loggerV1.Debug("peer selection", "chunk_address", chunkAddr, "error", err)
						return nil, err
					}
					continue
				}

				// Since the selected peer reaches into the neighborhood of the chunk,
				// act as a multiplexer and fan the request out to multiple peers in
				// parallel. Neighbor peers also get multiple retries, which means
				// almost the entire neighborhood is scanned for the chunk, from the
				// closest to the furthest peer in the neighborhood.
				if radius, err := s.radiusFunc(); err == nil && swarm.Proximity(peer.Bytes(), chunkAddr.Bytes()) >= radius {
					for ; forwards > 0; forwards-- {
						retry()
						errorsLeft++
					}
				}

				action, err := s.prepareCredit(ctx, peer, chunkAddr, origin)
				if err != nil {
					skip.Add(chunkAddr, peer, overDraftRefresh)
					retry()
					continue
				}
				skip.Forever(chunkAddr, peer)

				inflight++

				go func() {
					span, _, ctx := s.tracer.FollowSpanFromContext(spanCtx, "retrieve-chunk", s.logger, opentracing.Tag{Key: "address", Value: chunkAddr.String()})
					defer span.Finish()
					s.retrieveChunk(ctx, quit, chunkAddr, peer, resultC, action, span)
				}()

			case res := <-resultC:

				inflight--

				if res.err == nil {
					loggerV1.Debug("retrieved chunk", "chunk_address", chunkAddr, "peer_address", res.peer, "peer_proximity", swarm.Proximity(res.peer.Bytes(), chunkAddr.Bytes()))
					return res.chunk, nil
				}

				loggerV1.Debug("failed to get chunk", "chunk_address", chunkAddr, "peer_address", res.peer,
					"peer_proximity", swarm.Proximity(res.peer.Bytes(), chunkAddr.Bytes()), "error", res.err)

				errorsLeft--
				s.errSkip.Add(chunkAddr, res.peer, skiplistDur)
				retry()
			}
		}

		return nil, storage.ErrNotFound
	})
	if err != nil {
		s.metrics.RequestFailureCounter.Inc()
		s.logger.Debug("retrieval failed", "chunk_address", chunkAddr, "error", err)
		return nil, err
	}

	s.metrics.RequestSuccessCounter.Inc()

	return v, nil
}
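
// Because of the singleflight above, concurrent calls for the same chunk (and
// the same origin/forwarded role) share a single in-flight retrieval; a sketch
// using the hypothetical svc and chunkAddr from earlier:
//
//	var g errgroup.Group // golang.org/x/sync/errgroup
//	for i := 0; i < 4; i++ {
//		g.Go(func() error {
//			_, err := svc.RetrieveChunk(ctx, chunkAddr, swarm.ZeroAddress)
//			return err
//		})
//	}
//	err := g.Wait() // all four calls observe the result of one shared retrieval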

func (s *Service) retrieveChunk(ctx context.Context, quit chan struct{}, chunkAddr, peer swarm.Address, result chan retrievalResult, action accounting.Action, span opentracing.Span) {

	var (
		startTime = time.Now()
		err       error
		chunk     swarm.Chunk
	)

	defer func() {
		action.Cleanup()
		if err != nil {
			ext.LogError(span, err)
			s.metrics.TotalErrors.Inc()
		} else {
			span.LogFields(olog.Bool("success", true))
		}
		select {
		case result <- retrievalResult{err: err, chunk: chunk, peer: peer}:
		case <-quit:
			return
		}
	}()

	ctx, cancel := context.WithTimeout(ctx, RetrieveChunkTimeout)
	defer cancel()

	stream, err := s.streamer.NewStream(ctx, peer, nil, protocolName, protocolVersion, streamName)
	if err != nil {
		err = fmt.Errorf("new stream: %w", err)
		return
	}

	defer func() {
		if err != nil {
			_ = stream.Reset()
		} else {
			_ = stream.FullClose()
		}
	}()

	w, r := protobuf.NewWriterAndReader(stream)
	err = w.WriteMsgWithContext(ctx, &pb.Request{Addr: chunkAddr.Bytes()})
	if err != nil {
		err = fmt.Errorf("write request: %w peer %s", err, peer.String())
		return
	}

	var d pb.Delivery
	if err = r.ReadMsgWithContext(ctx, &d); err != nil {
		err = fmt.Errorf("read delivery: %w peer %s", err, peer.String())
		return
	}
	if d.Err != "" {
		err = p2p.NewChunkDeliveryError(d.Err)
		return
	}

	s.metrics.ChunkRetrieveTime.Observe(time.Since(startTime).Seconds())
	s.metrics.TotalRetrieved.Inc()

	// the delivered data must validate either as a content-addressed chunk
	// or as a single-owner chunk
	chunk = swarm.NewChunk(chunkAddr, d.Data)
	if !cac.Valid(chunk) && !soc.Valid(chunk) {
		s.metrics.InvalidChunkRetrieved.Inc()
		err = swarm.ErrInvalidChunk
		return
	}

	// settle the reserved credit only after a valid chunk has arrived
	err = action.Apply()
}
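
// The exchange above is a single round trip on a fresh stream; only the
// message fields used in this file are shown:
//
//	-> pb.Request{Addr: chunkAddr.Bytes()}
//	<- pb.Delivery{Data: ...} on success, or pb.Delivery{Err: "..."} on failure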

func (s *Service) prepareCredit(ctx context.Context, peer, chunk swarm.Address, origin bool) (accounting.Action, error) {

	price := s.pricer.PeerPrice(peer, chunk)
	s.metrics.ChunkPrice.Observe(float64(price))

	creditAction, err := s.accounting.PrepareCredit(ctx, peer, price, origin)
	if err != nil {
		return nil, err
	}

	return creditAction, nil
}
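
// Accounting is symmetric across the two ends of a stream: the requesting side
// reserves a credit here (settled with action.Apply only after a valid chunk
// arrives, released by action.Cleanup otherwise), while the serving side's
// handler prepares and applies the matching debit around its writeback.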

// closestPeer returns the address of the peer that is closest to the chunk
// with the provided address addr. Peers whose addresses appear in skipPeers
// are ignored. If allowUpstream is true, peers that are farther from the
// chunk than this node may also be returned, allowing the request to travel
// upstream.
func (s *Service) closestPeer(addr swarm.Address, skipPeers []swarm.Address, allowUpstream bool) (swarm.Address, error) {

	var (
		closest swarm.Address
		err     error
	)

	// prefer reachable and healthy peers, then progressively relax the
	// selection criteria when no candidate is found
	closest, err = s.peerSuggester.ClosestPeer(addr, false, topology.Select{Reachable: true, Healthy: true}, skipPeers...)
	if errors.Is(err, topology.ErrNotFound) {
		closest, err = s.peerSuggester.ClosestPeer(addr, false, topology.Select{Reachable: true}, skipPeers...)
		if errors.Is(err, topology.ErrNotFound) {
			closest, err = s.peerSuggester.ClosestPeer(addr, false, topology.Select{}, skipPeers...)
		}
	}

	if err != nil {
		return swarm.Address{}, err
	}

	if allowUpstream {
		return closest, nil
	}

	closer, err := closest.Closer(addr, s.addr)
	if err != nil {
		return swarm.Address{}, fmt.Errorf("distance compare addr %s closest %s base address %s: %w", addr.String(), closest.String(), s.addr.String(), err)
	}
	if !closer {
		return swarm.Address{}, topology.ErrNotFound
	}

	return closest, nil
}
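
// For orientation: swarm.Proximity returns the number of leading bits two
// addresses have in common, so larger values mean closer. A toy sketch with
// hypothetical one-byte addresses (real overlay addresses are 32 bytes):
//
//	a := swarm.NewAddress([]byte{0xe0}) // 1110 0000
//	b := swarm.NewAddress([]byte{0xc0}) // 1100 0000
//	_ = swarm.Proximity(a.Bytes(), b.Bytes()) // 2: the first two bits match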

func (s *Service) handler(p2pctx context.Context, p p2p.Peer, stream p2p.Stream) (err error) {
	ctx, cancel := context.WithTimeout(p2pctx, RetrieveChunkTimeout)
	defer cancel()

	w, r := protobuf.NewWriterAndReader(stream)
	var attemptedWrite bool

	defer func() {
		if err != nil {
			if !attemptedWrite {
				_ = w.WriteMsgWithContext(ctx, &pb.Delivery{Err: err.Error()})
			}
			_ = stream.Reset()
		} else {
			_ = stream.FullClose()
		}
	}()
	var req pb.Request
	if err := r.ReadMsgWithContext(ctx, &req); err != nil {
		return fmt.Errorf("read request: %w peer %s", err, p.Address.String())
	}

	addr := swarm.NewAddress(req.Addr)

	if addr.IsZero() || addr.IsEmpty() || !addr.IsValidLength() {
		return fmt.Errorf("invalid address queried by peer %s", p.Address.String())
	}

	var forwarded bool

	span, _, ctx := s.tracer.StartSpanFromContext(ctx, "handle-retrieve-chunk", s.logger, opentracing.Tag{Key: "address", Value: addr.String()})
	defer func() {
		if err != nil {
			ext.LogError(span, err)
		} else {
			span.LogFields(olog.Bool("success", true))
		}
		span.LogFields(olog.Bool("forwarded", forwarded))
		span.Finish()
	}()

	chunk, err := s.storer.Lookup().Get(ctx, addr)
	if err != nil {
		if errors.Is(err, storage.ErrNotFound) {
			// not stored locally, forward the request
			chunk, err = s.RetrieveChunk(ctx, addr, p.Address)
			if err != nil {
				return fmt.Errorf("retrieve chunk: %w", err)
			}
			forwarded = true
		} else {
			return fmt.Errorf("get from store: %w", err)
		}
	}

	chunkPrice := s.pricer.Price(chunk.Address())
	debit, err := s.accounting.PrepareDebit(ctx, p.Address, chunkPrice)
	if err != nil {
		return fmt.Errorf("prepare debit to peer %s before writeback: %w", p.Address.String(), err)
	}
	defer debit.Cleanup()

	attemptedWrite = true

	if err := w.WriteMsgWithContext(ctx, &pb.Delivery{
		Data: chunk.Data(),
	}); err != nil {
		return fmt.Errorf("write delivery: %w peer %s", err, p.Address.String())
	}

	// debit the chunk price from p's balance
	if err := debit.Apply(); err != nil {
		return fmt.Errorf("apply debit: %w", err)
	}

	// cache the chunk last, so that putting to the localstore does not slow
	// down the request flow
	if s.caching && forwarded {
		if err := s.storer.Cache().Put(p2pctx, chunk); err != nil {
			s.logger.Debug("retrieve cache put", "error", err)
		}
	}

	return nil
}

func (s *Service) Close() error {
	return s.errSkip.Close()
}