github.com/anacrolix/torrent@v1.61.0/webseed/client.go (about)

     1  package webseed
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"log/slog"
     9  	"net/http"
    10  	"os"
    11  	"runtime/pprof"
    12  	"strings"
    13  	"sync"
    14  
    15  	"github.com/RoaringBitmap/roaring"
    16  	g "github.com/anacrolix/generics"
    17  	"github.com/anacrolix/missinggo/v2/panicif"
    18  	"github.com/dustin/go-humanize"
    19  	"golang.org/x/time/rate"
    20  
    21  	"github.com/anacrolix/torrent/metainfo"
    22  	"github.com/anacrolix/torrent/segments"
    23  )
    24  
// How many consecutive bytes (48 KiB) to allow discarding from responses. This number is based on
// https://archive.org/download/BloodyPitOfHorror/BloodyPitOfHorror.asr.srt. It seems that
// archive.org might be using a webserver implementation that refuses to do partial responses to
// small files. TODO: Make this configurable.
const MaxDiscardBytes = 48 << 10
    30  
// Output debug information to stdout. Enabled at init time by the presence of
// the TORRENT_WEBSEED_DEBUG environment variable.
var PrintDebug = false
    33  
    34  func init() {
    35  	_, PrintDebug = os.LookupEnv("TORRENT_WEBSEED_DEBUG")
    36  }
    37  
    38  type RequestSpec = segments.Extent
    39  
// One HTTP request covering a single file's share of a larger extent. A
// Request spanning several files expands into several requestParts.
type requestPart struct {
	req *http.Request
	// The byte range of the file this part covers.
	fileRange segments.Extent
	// Total length of the file; used to sanity-check full-file (200) responses.
	fileLength int64
	// Executes the HTTP request, with optional debug logging attached.
	do func() (*http.Response, error)
	// Index of the file within the torrent, propagated into errors.
	fileIndex int
}
    47  
// Request is an in-flight webseed download of a contiguous extent, possibly
// backed by multiple HTTP requests whose bodies are concatenated into Body.
type Request struct {
	// So you can view it from externally.
	ctx    context.Context
	cancel context.CancelCauseFunc
	// The concatenated response bodies are read from here.
	Body io.Reader
	// Closed with error to unstick copy routine when context isn't checked.
	bodyPipe *io.PipeReader
}
    56  
// Context returns the request's context. It is derived (with cancel cause)
// from the context passed to StartNewRequest.
func (r *Request) Context() context.Context {
	return r.ctx
}
    60  
// Cancel aborts the request. The cause is retrievable via context.Cause on
// the request's context.
func (r *Request) Cancel(cause error) {
	r.cancel(cause)
}
    64  
// Close releases the read side of the body pipe, which makes subsequent pipe
// writes fail so the copying goroutine can exit.
func (r *Request) Close() {
	// We aren't cancelling because we want to know if we can keep receiving buffered data after
	// cancellation. PipeReader.Close always returns nil.
	_ = r.bodyPipe.Close()
}
    70  
// Client downloads torrent data from a single webseed URL for one torrent.
// SetInfo must succeed before requests can be started.
type Client struct {
	Logger     *slog.Logger
	HttpClient *http.Client
	// Base URL for the webseed. For multi-file (directory) torrents it must end in "/".
	Url string
	// Max concurrent requests to a WebSeed for a given torrent. TODO: Unused.
	MaxRequests int

	// Both set by SetInfo; maps torrent extents to per-file ranges.
	fileIndex *segments.Index
	info      *metainfo.Info
	// The pieces we can request with the Url. We're more likely to ban/block at the file-level
	// given that's how requests are mapped to webseeds, but the torrent.Client works at the piece
	// level. We can map our file-level adjustments to the pieces here. This probably need to be
	// private in the future, if Client ever starts removing pieces. TODO: This belongs in
	// webseedPeer. TODO: Unused.
	Pieces roaring.Bitmap
	// This wraps http.Response bodies, for example to limit the download rate.
	ResponseBodyWrapper ResponseBodyWrapper
	// NOTE(review): not consumed in this file — presumably applied by whoever
	// installs ResponseBodyWrapper; confirm against callers.
	ResponseBodyRateLimiter *rate.Limiter
	PathEscaper             PathEscaper
}
    91  
    92  type ResponseBodyWrapper func(r io.Reader, interrupt func()) io.Reader
    93  
    94  func (me *Client) SetInfo(info *metainfo.Info, fileIndex *segments.Index) {
    95  	if !strings.HasSuffix(me.Url, "/") && info.IsDir() {
    96  		// In my experience, this is a non-conforming webseed. For example the
    97  		// http://ia600500.us.archive.org/1/items URLs in archive.org torrents.
    98  		me.Logger.Warn("webseed URL does not end with / and torrent is a directory")
    99  		return
   100  	}
   101  	me.fileIndex = fileIndex
   102  	me.info = info
   103  	me.Pieces.AddRange(0, uint64(info.NumPieces()))
   104  }
   105  
// Returns the URL for the given file index. This is assumed to be globally unique.
// Path construction/escaping is delegated to urlForFileIndex using ws.PathEscaper.
func (ws *Client) UrlForFileIndex(fileIndex int) string {
	return urlForFileIndex(ws.Url, fileIndex, ws.info, ws.PathEscaper)
}
   110  
// StartNewRequest begins downloading the extent r. The extent may span
// multiple files, expanding into one HTTP range request per file; the
// response bodies are concatenated and readable from the returned Request's
// Body. Panics if SetInfo has not completed successfully.
func (ws *Client) StartNewRequest(ctx context.Context, r RequestSpec, debugLogger *slog.Logger) Request {
	ctx, cancel := context.WithCancelCause(ctx)
	var requestParts []requestPart
	// These are set by SetInfo; requests before that is a programmer error.
	panicif.Nil(ws.fileIndex)
	panicif.Nil(ws.info)
	// Map the extent onto the files it touches, building one part per file.
	for i, e := range ws.fileIndex.LocateIter(r) {
		req, err := newRequest(
			ctx,
			ws.Url, i, ws.info, e.Start, e.Length,
			ws.PathEscaper,
		)
		panicif.Err(err)
		part := requestPart{
			req:        req,
			fileRange:  e,
			fileLength: ws.fileIndex.Index(i).Length,
			fileIndex:  i,
		}
		// do is deferred so the HTTP round trip happens when the part is consumed.
		part.do = func() (resp *http.Response, err error) {
			resp, err = ws.HttpClient.Do(req)
			if PrintDebug {
				if err == nil {
					debugLogger.Debug(
						"request for part",
						"url", req.URL,
						"part-length", humanize.IBytes(uint64(e.Length)),
						"part-file-offset", humanize.IBytes(uint64(e.Start)),
						"file-length", humanize.IBytes(uint64(part.fileLength)),
						"CF-Cache-Status", resp.Header.Get("CF-Cache-Status"),
					)
				}
			}
			return
		}
		requestParts = append(requestParts, part)
	}
	// Technically what we want to ensure is that all parts exist consecutively. If the file data
	// isn't consecutive, then it is piece aligned and we wouldn't need to be doing multiple
	// requests. TODO: Assert this.
	panicif.Zero(len(requestParts))
	body, w := io.Pipe()
	req := Request{
		ctx:      ctx,
		cancel:   cancel,
		Body:     body,
		bodyPipe: body,
	}
	// Responses are streamed into the pipe from a separate goroutine; errors
	// surface to the reader via CloseWithError.
	go ws.requestPartResponsesReader(ctx, w, requestParts)
	return req
}
   161  
   162  // Concatenates request part responses and sends them over the pipe.
   163  func (ws *Client) requestPartResponsesReader(ctx context.Context, w *io.PipeWriter, requestParts []requestPart) {
   164  	pprof.SetGoroutineLabels(context.Background())
   165  	err := ws.readRequestPartResponses(ctx, w, requestParts)
   166  	panicif.Err(w.CloseWithError(err))
   167  }
   168  
   169  type ErrStatusOkForRangeRequest struct{}
   170  
   171  func (ErrStatusOkForRangeRequest) Error() string {
   172  	return "resp status ok but requested range"
   173  }
   174  
   175  type ErrBadResponse struct {
   176  	Msg      string
   177  	Response *http.Response
   178  }
   179  
   180  func (me ErrBadResponse) Error() string {
   181  	return me.Msg
   182  }
   183  
   184  // Warn about bad content-lengths.
   185  func (me *Client) checkContentLength(resp *http.Response, part requestPart, expectedLen int64) {
   186  	if resp.ContentLength == -1 {
   187  		return
   188  	}
   189  	switch resp.Header.Get("Content-Encoding") {
   190  	case "identity", "":
   191  	default:
   192  		return
   193  	}
   194  	if resp.ContentLength != expectedLen {
   195  		me.Logger.Warn("unexpected identity response Content-Length value",
   196  			"actual", resp.ContentLength,
   197  			"expected", expectedLen,
   198  			"url", part.req.URL.String())
   199  	}
   200  }
   201  
   202  var bufPool = &sync.Pool{New: func() any {
   203  	return g.PtrTo(make([]byte, 128<<10)) // 128 KiB. 4x the default.
   204  }}
   205  
// Reads the part in full. All expected bytes must be copied to w or an error
// is returned. Always closes resp.Body. Handles 206 (exact range), 200
// (discard the prefix up to MaxDiscardBytes, then read the range), and 503
// (back off via ErrTooFast); anything else becomes ErrBadResponse.
func (me *Client) recvPartResult(ctx context.Context, w io.Writer, part requestPart, resp *http.Response) error {
	defer resp.Body.Close()
	var body io.Reader = resp.Body
	if a := me.ResponseBodyWrapper; a != nil {
		// The interrupt func lets the wrapper unstick a blocked read by closing the body.
		body = a(body, func() { panicif.Err(resp.Body.Close()) })
	}
	// We did set resp.Body to nil here, but I'm worried the HTTP machinery might do something
	// funny.
	if ctx.Err() != nil {
		return context.Cause(ctx)
	}
	switch resp.StatusCode {
	case http.StatusPartialContent:
		// The response should be just as long as we requested.
		me.checkContentLength(resp, part, part.fileRange.Length)
		// Use a pooled buffer to avoid io.Copy's per-call allocation.
		buf := bufPool.Get().(*[]byte)
		defer bufPool.Put(buf)
		copied, err := io.CopyBuffer(w, body, *buf)
		if err != nil {
			return err
		}
		// A short (or long) body means the server lied about the range.
		if copied != part.fileRange.Length {
			return fmt.Errorf("got %v bytes, expected %v", copied, part.fileRange.Length)
		}
		return nil
	case http.StatusOK:
		// The response is from the beginning of the file.
		me.checkContentLength(resp, part, part.fileLength)
		discard := part.fileRange.Start
		if discard != 0 {
			me.Logger.Debug("resp status ok but requested range",
				"url", part.req.URL.String(),
				"range", part.req.Header.Get("Range"))
		}
		if discard > MaxDiscardBytes {
			// TODO: So I think this can happen if the webseed host is caching and needs to pull
			// from the origin. If you try again later it will probably work.
			return ErrStatusOkForRangeRequest{}
		}
		// Instead of discarding, we could try receiving all the chunks present in the response
		// body. I don't know how one would handle multiple chunk requests resulting in an OK
		// response for the same file. The request algorithm might need to be smarter for that.
		discarded, err := io.CopyN(io.Discard, body, discard)
		if err != nil {
			return fmt.Errorf("error discarding bytes from http ok response: %w", err)
		}
		panicif.NotEq(discarded, discard)
		// Because the reply is not a partial aware response, we limit the body reader
		// intentionally.
		_, err = io.CopyN(w, body, part.fileRange.Length)
		return err
	case http.StatusServiceUnavailable:
		// TODO: Include all of Erigon's cases here?
		return ErrTooFast
	default:
		// TODO: Could we have a slog.Valuer or something to allow callers to unpack reasonable values?
		return ErrBadResponse{
			fmt.Sprintf("unhandled response status code (%v)", resp.Status),
			resp,
		}
	}
}
   269  
   270  var ErrTooFast = errors.New("making requests too fast")
   271  
   272  // Contains info for callers to act (like ignoring particular files or rate limiting).
   273  type ReadRequestPartError struct {
   274  	FileIndex int
   275  	Err       error
   276  }
   277  
   278  func (me ReadRequestPartError) Unwrap() error {
   279  	return me.Err
   280  }
   281  
   282  func (r ReadRequestPartError) Error() string {
   283  	return fmt.Sprintf("reading request part for file index %v: %v", r.FileIndex, r.Err)
   284  }
   285  
   286  func (me *Client) readRequestPartResponses(ctx context.Context, w io.Writer, parts []requestPart) (err error) {
   287  	for _, part := range parts {
   288  		var resp *http.Response
   289  		resp, err = part.do()
   290  		// TODO: Does debugging caching belong here?
   291  		if err == nil {
   292  			err = me.recvPartResult(ctx, w, part, resp)
   293  		}
   294  		if err != nil {
   295  			err = fmt.Errorf("reading %q at %q: %w", part.req.URL, part.req.Header.Get("Range"), err)
   296  			err = ReadRequestPartError{
   297  				FileIndex: part.fileIndex,
   298  				Err:       err,
   299  			}
   300  			break
   301  		}
   302  	}
   303  	return
   304  }