// Package webseed implements an HTTP client for downloading torrent data from
// web seed URLs (plain HTTP servers mirroring torrent content).
package webseed

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"os"
	"runtime/pprof"
	"strings"
	"sync"

	"github.com/RoaringBitmap/roaring"
	g "github.com/anacrolix/generics"
	"github.com/anacrolix/missinggo/v2/panicif"
	"github.com/dustin/go-humanize"
	"golang.org/x/time/rate"

	"github.com/anacrolix/torrent/metainfo"
	"github.com/anacrolix/torrent/segments"
)

// How many consecutive bytes to allow discarding from responses. This number is based on
// https://archive.org/download/BloodyPitOfHorror/BloodyPitOfHorror.asr.srt. It seems that
// archive.org might be using a webserver implementation that refuses to do partial responses to
// small files. TODO: Make this configurable.
const MaxDiscardBytes = 48 << 10

// Output debug information to stdout. Enabled at init by setting the
// TORRENT_WEBSEED_DEBUG environment variable (any value, including empty).
var PrintDebug = false

func init() {
	_, PrintDebug = os.LookupEnv("TORRENT_WEBSEED_DEBUG")
}

// RequestSpec is a byte extent (offset and length) within the torrent's data.
type RequestSpec = segments.Extent

// requestPart is one HTTP request covering the portion of a RequestSpec that
// falls within a single file of the torrent.
type requestPart struct {
	req *http.Request
	// The extent of the file this part covers.
	fileRange  segments.Extent
	fileLength int64
	// Executes the HTTP request; assigned in StartNewRequest so it can also do
	// per-part debug logging.
	do        func() (*http.Response, error)
	fileIndex int
}

// Request is an in-flight webseed download. Body yields the concatenated data
// of all the request parts.
type Request struct {
	// So you can view it from externally.
	ctx    context.Context
	cancel context.CancelCauseFunc
	Body   io.Reader
	// Closed with error to unstick copy routine when context isn't checked.
	bodyPipe *io.PipeReader
}

// Context returns the request's (cancellable) context.
func (r *Request) Context() context.Context {
	return r.ctx
}

// Cancel cancels the request's context with the given cause.
func (r *Request) Cancel(cause error) {
	r.cancel(cause)
}

// Close closes the read side of the body pipe without cancelling the context.
func (r *Request) Close() {
	// We aren't cancelling because we want to know if we can keep receiving buffered data after
	// cancellation. PipeReader.Close always returns nil.
	_ = r.bodyPipe.Close()
}

// Client downloads torrent data from a single webseed URL.
type Client struct {
	Logger     *slog.Logger
	HttpClient *http.Client
	Url        string
	// Max concurrent requests to a WebSeed for a given torrent.
	// TODO: Unused.
	MaxRequests int

	fileIndex *segments.Index
	info      *metainfo.Info
	// The pieces we can request with the Url. We're more likely to ban/block at the file-level
	// given that's how requests are mapped to webseeds, but the torrent.Client works at the piece
	// level. We can map our file-level adjustments to the pieces here. This probably needs to be
	// private in the future, if Client ever starts removing pieces. TODO: This belongs in
	// webseedPeer. TODO: Unused.
	Pieces roaring.Bitmap
	// This wraps http.Response bodies, for example to limit the download rate.
	ResponseBodyWrapper     ResponseBodyWrapper
	ResponseBodyRateLimiter *rate.Limiter
	PathEscaper             PathEscaper
}

// ResponseBodyWrapper wraps a response body reader. The interrupt func closes
// the underlying response body (see recvPartResult), letting the wrapper abort
// the transfer.
type ResponseBodyWrapper func(r io.Reader, interrupt func()) io.Reader

// SetInfo supplies the torrent info and file index the Client needs to map
// extents onto per-file URLs. Must be called before StartNewRequest.
func (me *Client) SetInfo(info *metainfo.Info, fileIndex *segments.Index) {
	if !strings.HasSuffix(me.Url, "/") && info.IsDir() {
		// In my experience, this is a non-conforming webseed. For example the
		// http://ia600500.us.archive.org/1/items URLs in archive.org torrents.
		me.Logger.Warn("webseed URL does not end with / and torrent is a directory")
		// NOTE(review): fileIndex/info stay nil on this path, so a subsequent
		// StartNewRequest will panic via panicif.Nil.
		return
	}
	me.fileIndex = fileIndex
	me.info = info
	me.Pieces.AddRange(0, uint64(info.NumPieces()))
}

// Returns the URL for the given file index. This is assumed to be globally unique.
107 func (ws *Client) UrlForFileIndex(fileIndex int) string { 108 return urlForFileIndex(ws.Url, fileIndex, ws.info, ws.PathEscaper) 109 } 110 111 func (ws *Client) StartNewRequest(ctx context.Context, r RequestSpec, debugLogger *slog.Logger) Request { 112 ctx, cancel := context.WithCancelCause(ctx) 113 var requestParts []requestPart 114 panicif.Nil(ws.fileIndex) 115 panicif.Nil(ws.info) 116 for i, e := range ws.fileIndex.LocateIter(r) { 117 req, err := newRequest( 118 ctx, 119 ws.Url, i, ws.info, e.Start, e.Length, 120 ws.PathEscaper, 121 ) 122 panicif.Err(err) 123 part := requestPart{ 124 req: req, 125 fileRange: e, 126 fileLength: ws.fileIndex.Index(i).Length, 127 fileIndex: i, 128 } 129 part.do = func() (resp *http.Response, err error) { 130 resp, err = ws.HttpClient.Do(req) 131 if PrintDebug { 132 if err == nil { 133 debugLogger.Debug( 134 "request for part", 135 "url", req.URL, 136 "part-length", humanize.IBytes(uint64(e.Length)), 137 "part-file-offset", humanize.IBytes(uint64(e.Start)), 138 "file-length", humanize.IBytes(uint64(part.fileLength)), 139 "CF-Cache-Status", resp.Header.Get("CF-Cache-Status"), 140 ) 141 } 142 } 143 return 144 } 145 requestParts = append(requestParts, part) 146 } 147 // Technically what we want to ensure is that all parts exist consecutively. If the file data 148 // isn't consecutive, then it is piece aligned and we wouldn't need to be doing multiple 149 // requests. TODO: Assert this. 150 panicif.Zero(len(requestParts)) 151 body, w := io.Pipe() 152 req := Request{ 153 ctx: ctx, 154 cancel: cancel, 155 Body: body, 156 bodyPipe: body, 157 } 158 go ws.requestPartResponsesReader(ctx, w, requestParts) 159 return req 160 } 161 162 // Concatenates request part responses and sends them over the pipe. 
func (ws *Client) requestPartResponsesReader(ctx context.Context, w *io.PipeWriter, requestParts []requestPart) {
	// Use a fresh background context so pprof labels inherited from the
	// spawning goroutine don't apply here.
	pprof.SetGoroutineLabels(context.Background())
	err := ws.readRequestPartResponses(ctx, w, requestParts)
	// Propagate the result (nil means EOF) to the Body reader.
	panicif.Err(w.CloseWithError(err))
}

// ErrStatusOkForRangeRequest is returned when a server answers a range request
// with 200 OK and more than MaxDiscardBytes would have to be discarded.
type ErrStatusOkForRangeRequest struct{}

func (ErrStatusOkForRangeRequest) Error() string {
	return "resp status ok but requested range"
}

// ErrBadResponse wraps an HTTP response with an unhandled status code.
type ErrBadResponse struct {
	Msg      string
	Response *http.Response
}

func (me ErrBadResponse) Error() string {
	return me.Msg
}

// Warn about bad content-lengths.
func (me *Client) checkContentLength(resp *http.Response, part requestPart, expectedLen int64) {
	// -1 means the length is unknown; nothing to check.
	if resp.ContentLength == -1 {
		return
	}
	// Only identity (or unspecified) encoding maps Content-Length directly to
	// payload bytes.
	switch resp.Header.Get("Content-Encoding") {
	case "identity", "":
	default:
		return
	}
	if resp.ContentLength != expectedLen {
		me.Logger.Warn("unexpected identity response Content-Length value",
			"actual", resp.ContentLength,
			"expected", expectedLen,
			"url", part.req.URL.String())
	}
}

// Pooled copy buffers for recvPartResult.
var bufPool = &sync.Pool{New: func() any {
	return g.PtrTo(make([]byte, 128<<10)) // 128 KiB. 4x the default.
}}

// Reads the part in full. All expected bytes must be returned or there will be an error returned.
func (me *Client) recvPartResult(ctx context.Context, w io.Writer, part requestPart, resp *http.Response) error {
	defer resp.Body.Close()
	var body io.Reader = resp.Body
	// The wrapper's interrupt func closes the response body to abort the copy.
	if a := me.ResponseBodyWrapper; a != nil {
		body = a(body, func() { panicif.Err(resp.Body.Close()) })
	}
	// We did set resp.Body to nil here, but I'm worried the HTTP machinery might do something
	// funny.
	if ctx.Err() != nil {
		return context.Cause(ctx)
	}
	switch resp.StatusCode {
	case http.StatusPartialContent:
		// The response should be just as long as we requested.
		me.checkContentLength(resp, part, part.fileRange.Length)
		buf := bufPool.Get().(*[]byte)
		defer bufPool.Put(buf)
		copied, err := io.CopyBuffer(w, body, *buf)
		if err != nil {
			return err
		}
		// A short (or long) copy means the server lied about the range.
		if copied != part.fileRange.Length {
			return fmt.Errorf("got %v bytes, expected %v", copied, part.fileRange.Length)
		}
		return nil
	case http.StatusOK:
		// The response is from the beginning of the file.
		me.checkContentLength(resp, part, part.fileLength)
		discard := part.fileRange.Start
		if discard != 0 {
			me.Logger.Debug("resp status ok but requested range",
				"url", part.req.URL.String(),
				"range", part.req.Header.Get("Range"))
		}
		if discard > MaxDiscardBytes {
			// TODO: So I think this can happen if the webseed host is caching and needs to pull
			// from the origin. If you try again later it will probably work.
			return ErrStatusOkForRangeRequest{}
		}
		// Instead of discarding, we could try receiving all the chunks present in the response
		// body. I don't know how one would handle multiple chunk requests resulting in an OK
		// response for the same file. The request algorithm might need to be smarter for that.
		discarded, err := io.CopyN(io.Discard, body, discard)
		if err != nil {
			return fmt.Errorf("error discarding bytes from http ok response: %w", err)
		}
		panicif.NotEq(discarded, discard)
		// Because the reply is not a partial aware response, we limit the body reader
		// intentionally.
		_, err = io.CopyN(w, body, part.fileRange.Length)
		return err
	case http.StatusServiceUnavailable:
		// TODO: Include all of Erigon's cases here?
		return ErrTooFast
	default:
		// TODO: Could we have a slog.Valuer or something to allow callers to unpack reasonable values?
		return ErrBadResponse{
			fmt.Sprintf("unhandled response status code (%v)", resp.Status),
			resp,
		}
	}
}

// ErrTooFast is returned for 503 Service Unavailable responses.
var ErrTooFast = errors.New("making requests too fast")

// Contains info for callers to act (like ignoring particular files or rate limiting).
type ReadRequestPartError struct {
	FileIndex int
	Err       error
}

func (me ReadRequestPartError) Unwrap() error {
	return me.Err
}

func (r ReadRequestPartError) Error() string {
	return fmt.Sprintf("reading request part for file index %v: %v", r.FileIndex, r.Err)
}

// Performs each part's request in order, streaming response bodies into w.
// Stops at the first failure, which is returned as a ReadRequestPartError.
func (me *Client) readRequestPartResponses(ctx context.Context, w io.Writer, parts []requestPart) (err error) {
	for _, part := range parts {
		var resp *http.Response
		resp, err = part.do()
		// TODO: Does debugging caching belong here?
		if err == nil {
			err = me.recvPartResult(ctx, w, part, resp)
		}
		if err != nil {
			err = fmt.Errorf("reading %q at %q: %w", part.req.URL, part.req.Header.Get("Range"), err)
			err = ReadRequestPartError{
				FileIndex: part.fileIndex,
				Err:       err,
			}
			break
		}
	}
	return
}