github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/xact/xs/blob_download.go

// Package xs is a collection of eXtended actions (xactions), including multi-object
// operations, list-objects, (cluster) rebalance and (target) resilver, ETL, and more.
/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 */
package xs

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"sync"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/feat"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/xact"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// TODO:
// 1. load, latest-ver, checksum, write, finalize
// 2. track each chunk reader with 'started' timestamp; abort/retry individual chunks; timeout
// 3. validate `expCksum`

// default tunables (can be overridden via apc.BlobMsg)
const (
	dfltChunkSize  = 2 * cos.MiB
	minChunkSize   = memsys.DefaultBufSize
	maxChunkSize   = 16 * cos.MiB
	dfltNumWorkers = 4

	maxInitialSizeSGL = 128           // vec length
	maxTotalChunks    = 128 * cos.MiB // max mem per blob downloader
)

type (
	XactBlobDl struct {
		writer  io.Writer
		args    *core.BlobParams
		readers []*blobReader
		workCh  chan chunkWi
		doneCh  chan chunkDone
		nextRoff int64
		woff     int64
		xact.Base
		sgls  []*memsys.SGL
		cksum cos.CksumHash
		wg    sync.WaitGroup
		// not necessarily equal to the user-provided apc.BlobMsg values;
		// in particular, chunk size and the number of workers may be adjusted based on available resources
		chunkSize  int64
		fullSize   int64
		numWorkers int
	}
)

// internal
type (
	blobReader struct {
		parent *XactBlobDl
	}
	chunkWi struct {
		sgl  *memsys.SGL
		roff int64
	}
	chunkDone struct {
		err  error
		sgl  *memsys.SGL
		roff int64
		code int
	}
	blobFactory struct {
		xreg.RenewBase
		pre  *XactBlobDl
		xctn *XactBlobDl
	}
)

// interface guard
var (
	_ core.Xact      = (*XactBlobDl)(nil)
	_ xreg.Renewable = (*blobFactory)(nil)
)

// NOTE: to optimize out the additional HEAD request (below), the caller must pass `oa` attrs (the lom alone is not enough)
func RenewBlobDl(xid string, params *core.BlobParams, oa *cmn.ObjAttrs) xreg.RenewRes {
	var (
		lom = params.Lom
		pre = &XactBlobDl{args: params} // preliminary ("keep filling" below)
	)
	pre.chunkSize = params.Msg.ChunkSize
	pre.numWorkers = params.Msg.NumWorkers
	if oa == nil {
		// backend.HeadObj(), unless already done via a prior (e.g. latest-ver or prefetch-threshold) check
		// (in the latter case, oa.Size must be present)
		oah, ecode, err := core.T.Backend(lom.Bck()).HeadObj(context.Background(), lom, nil /*origReq*/)
		if err != nil {
			return xreg.RenewRes{Err: err}
		}
		debug.Assert(ecode == 0)
		oa = oah
	}
	// fill in custom MD
	lom.SetCustomMD(oa.CustomMD)
	lom.SetVersion(oa.Ver)
	lom.SetAtimeUnix(oa.Atime)
	// and separately:
	debug.Assert(oa.Size > 0)
	pre.fullSize = oa.Size

	if params.Msg.FullSize > 0 && params.Msg.FullSize != pre.fullSize {
		name := xact.Cname(apc.ActBlobDl, xid) + "/" + lom.Cname()
		err := fmt.Errorf("%s: user-specified size %d, have %d", name, params.Msg.FullSize, pre.fullSize)
		return xreg.RenewRes{Err: err}
	}

	// validate, assign defaults (tune-up below)
	if pre.chunkSize == 0 {
		pre.chunkSize = dfltChunkSize
	} else if pre.chunkSize < minChunkSize {
		nlog.Infoln("Warning: chunk size", cos.ToSizeIEC(pre.chunkSize, 1), "is below permitted minimum",
			cos.ToSizeIEC(minChunkSize, 0))
		pre.chunkSize = minChunkSize
	} else if pre.chunkSize > maxChunkSize {
		nlog.Infoln("Warning: chunk size", cos.ToSizeIEC(pre.chunkSize, 1), "exceeds permitted maximum",
			cos.ToSizeIEC(maxChunkSize, 0))
		pre.chunkSize = maxChunkSize
	}
	if pre.numWorkers == 0 {
		pre.numWorkers = dfltNumWorkers
	}
	if int64(pre.numWorkers)*pre.chunkSize > pre.fullSize {
		pre.numWorkers = int((pre.fullSize + pre.chunkSize - 1) / pre.chunkSize)
	}
	if a := cmn.MaxParallelism(); a < pre.numWorkers {
		pre.numWorkers = a
	}
	return xreg.RenewBucketXact(apc.ActBlobDl, lom.Bck(), xreg.Args{UUID: xid, Custom: pre})
}
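
// Sizing arithmetic above, illustrated with hypothetical numbers: for a 3 MiB
// blob and the default 2 MiB chunk, the default 4 workers would over-allocate,
// so numWorkers drops to ceil(3 MiB / 2 MiB) = 2 (and, possibly, further down
// to cmn.MaxParallelism()); a requested chunk size outside the
// [minChunkSize, maxChunkSize] interval gets clamped to the nearest bound.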

//
// blobFactory
//

func (*blobFactory) New(args xreg.Args, bck *meta.Bck) xreg.Renewable {
	debug.Assert(bck.IsRemote())
	p := &blobFactory{
		RenewBase: xreg.RenewBase{Args: args, Bck: bck},
		pre:       args.Custom.(*XactBlobDl),
	}
	return p
}

func (p *blobFactory) Start() error {
	// reuse the same args-carrying structure and keep filling it in
	r := p.pre
	r.InitBase(p.Args.UUID, p.Kind(), r.args.Lom.Bck())

	// second (just-in-time) tune-up
	var (
		mm       = core.T.PageMM()
		slabSize = int64(memsys.MaxPageSlabSize)
		pressure = mm.Pressure()
	)
	if pressure >= memsys.PressureExtreme {
		return errors.New(r.Name() + ": extreme memory pressure - not starting")
	}
	switch pressure {
	case memsys.PressureHigh:
		slabSize = memsys.DefaultBufSize
		r.numWorkers = 1
		nlog.Warningln(r.Name() + ": high memory pressure detected...")
	case memsys.PressureModerate:
		slabSize >>= 1
		r.numWorkers = min(3, r.numWorkers)
	}

	cnt := max((r.chunkSize+slabSize-1)/slabSize, 1)
	r.chunkSize = min(cnt*slabSize, r.fullSize)

	if cnt > maxInitialSizeSGL {
		cnt = maxInitialSizeSGL
	}

	// add a reader, if possible
	nr := int64(r.numWorkers)
	if pressure == memsys.PressureLow && r.numWorkers < cmn.MaxParallelism() &&
		nr < (r.fullSize+r.chunkSize-1)/r.chunkSize &&
		nr*r.chunkSize < maxTotalChunks-r.chunkSize {
		r.numWorkers++
	}

	// open channels
	r.workCh = make(chan chunkWi, r.numWorkers)
	r.doneCh = make(chan chunkDone, r.numWorkers)
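
	// Memory note: each reader below owns a single scatter-gather buffer (SGL)
	// that holds at most one chunk at a time; its initial capacity, cnt*slabSize,
	// approximates one chunk, with cnt capped at maxInitialSizeSGL slabs.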
	// init and allocate
	r.readers = make([]*blobReader, r.numWorkers)
	r.sgls = make([]*memsys.SGL, r.numWorkers)
	for i := range r.readers {
		r.readers[i] = &blobReader{
			parent: r,
		}
		r.sgls[i] = mm.NewSGL(cnt*slabSize, slabSize)
	}

	p.xctn = r

	// deliver locally for custom processing
	if r.args.WriteSGL != nil {
		return nil
	}

	//
	// otherwise (normally), multi-writer that may also include remote send
	//

	ws := make([]io.Writer, 0, 3)
	if ty := r.args.Lom.CksumConf().Type; ty != cos.ChecksumNone {
		r.cksum.Init(ty)
		ws = append(ws, r.cksum.H)
	}
	ws = append(ws, r.args.Lmfh)
	if r.args.RspW != nil {
		// and transmit concurrently (alternatively,
		// could keep writing locally even after GET client goes away)
		ws = append(ws, r.args.RspW)

		whdr := r.args.RspW.Header()
		whdr.Set(cos.HdrContentLength, strconv.FormatInt(r.fullSize, 10))
		whdr.Set(cos.HdrContentType, cos.ContentBinary)
		if v, ok := r.args.Lom.GetCustomKey(cmn.ETag); ok {
			whdr.Set(cos.HdrETag, v)
		}
	}
	r.writer = cos.NewWriterMulti(ws...)
	return nil
}

func (*blobFactory) Kind() string     { return apc.ActBlobDl }
func (p *blobFactory) Get() core.Xact { return p.xctn }

func (p *blobFactory) WhenPrevIsRunning(prev xreg.Renewable) (xreg.WPR, error) {
	var (
		xprev   = prev.Get().(*XactBlobDl)
		lomPrev = xprev.args.Lom
		xcurr   = p.xctn
		lomCurr = xcurr.args.Lom
	)
	if lomPrev.Bucket().Equal(lomCurr.Bucket()) && lomPrev.ObjName == lomCurr.ObjName {
		return xreg.WprUse, cmn.NewErrXactUsePrev(prev.Get().String())
	}
	return xreg.WprKeepAndStartNew, nil
}

//
// XactBlobDl
//

func (r *XactBlobDl) Name() string { return r.Base.Name() + "/" + r.args.Lom.ObjName }

func (r *XactBlobDl) Run(*sync.WaitGroup) {
	var (
		err     error
		pending []chunkDone
		eof     bool
	)
	nlog.Infoln(r.Name()+": chunk-size", cos.ToSizeIEC(r.chunkSize, 0)+", num-concurrent-readers", r.numWorkers)
	r.start()
outer:
	for {
		select {
		case done := <-r.doneCh:
			sgl, sz := done.sgl, done.sgl.Size()
			if done.code == http.StatusRequestedRangeNotSatisfiable && r.fullSize > done.roff+sz {
				err = fmt.Errorf("%s: premature eof: expected size %d, have %d", r.Name(), r.fullSize, done.roff+sz)
				goto fin
			}
			if sz > 0 && r.fullSize < done.roff+sz {
				err = fmt.Errorf("%s: detected size increase during download: expected %d, have (%d + %d)", r.Name(),
					r.fullSize, done.roff, sz)
				goto fin
			}
			eof = r.fullSize <= done.roff+sz
			debug.Assert(sz > 0 || eof)
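
			// Out-of-order completion: readers may finish chunks in any order
			// while writing must proceed strictly at r.woff; a chunk that is not
			// next in line is parked in `pending`, kept sorted by descending
			// offset so that the next-to-write (lowest) offset sits at the tail.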
			// add pending in the offset-descending order
			if done.roff != r.woff {
				debug.Assert(done.roff > r.woff)
				debug.Assert((done.roff-r.woff)%r.chunkSize == 0)
				pending = append(pending, chunkDone{roff: -1})
				for i := range pending {
					if i == len(pending)-1 || (pending[i].roff >= 0 && pending[i].roff < done.roff) {
						copy(pending[i+1:], pending[i:])
						pending[i] = done
						continue outer
					}
				}
			}
			// type1 write
			if err = r.write(sgl); err != nil {
				goto fin
			}

			if r.nextRoff < r.fullSize {
				debug.Assert(sgl.Size() == 0)
				r.workCh <- chunkWi{sgl, r.nextRoff}
				r.nextRoff += r.chunkSize
			}

			// walk backwards and plug any holes
			for i := len(pending) - 1; i >= 0; i-- {
				done := pending[i]
				if done.roff > r.woff {
					break
				}
				debug.Assert(done.roff == r.woff)

				// type2 write: remove from pending and append
				sgl := done.sgl
				pending = pending[:i]
				if err = r.write(sgl); err != nil {
					goto fin
				}
				if r.nextRoff < r.fullSize {
					debug.Assert(sgl.Size() == 0)
					r.workCh <- chunkWi{sgl, r.nextRoff}
					r.nextRoff += r.chunkSize
				}
			}
			if r.woff >= r.fullSize {
				debug.Assertf(r.woff == r.fullSize, "%d > %d", r.woff, r.fullSize)
				goto fin
			}
			if eof && cmn.Rom.FastV(5, cos.SmoduleXs) {
				nlog.Errorf("%s eof w/pending: woff=%d, next=%d, size=%d", r.Name(), r.woff, r.nextRoff, r.fullSize)
				for i := len(pending) - 1; i >= 0; i-- {
					nlog.Errorf("    roff %d", pending[i].roff)
				}
			}
		case <-r.ChanAbort():
			err = cmn.ErrXactUserAbort
			goto fin
		}
	}
fin:
	close(r.workCh)

	if r.args.WriteSGL != nil {
		errN := r.args.WriteSGL(nil)
		debug.AssertNoErr(errN)
	} else {
		// finalize r.args.Lom
		if err == nil && r.args.Lom.IsFeatureSet(feat.FsyncPUT) {
			err = r.args.Lmfh.Sync()
		}
		cos.Close(r.args.Lmfh)

		if err == nil {
			if r.fullSize != r.woff {
				err = fmt.Errorf("%s: exp size %d != %d off", r.Name(), r.fullSize, r.woff)
				debug.AssertNoErr(err)
			} else {
				r.args.Lom.SetSize(r.woff)
				if r.cksum.H != nil {
					r.cksum.Finalize()
					r.args.Lom.SetCksum(r.cksum.Clone())
				}
				_, err = core.T.FinalizeObj(r.args.Lom, r.args.Wfqn, r, cmn.OwtGetPrefetchLock)
			}
		}
		if err == nil {
			r.ObjsAdd(1, 0)
		} else {
			if errRemove := cos.RemoveFile(r.args.Wfqn); errRemove != nil && !os.IsNotExist(errRemove) {
				nlog.Errorln("nested err:", errRemove)
			}
			if err != cmn.ErrXactUserAbort {
				r.Abort(err)
			}
		}
	}

	r.wg.Wait()
	close(r.doneCh)
	r.cleanup()
	r.Finish()
}

func (r *XactBlobDl) start() {
	r.wg.Add(len(r.readers))
	for i := range r.readers {
		go r.readers[i].run()
	}
	for i := range r.readers {
		r.workCh <- chunkWi{r.sgls[i], r.nextRoff}
		r.nextRoff += r.chunkSize
	}
}

func (r *XactBlobDl) write(sgl *memsys.SGL) (err error) {
	var (
		written int64
		size    = sgl.Size()
	)
	if r.args.WriteSGL != nil {
		err = r.args.WriteSGL(sgl)
		written = sgl.Size() - sgl.Len()
	} else {
		written, err = io.Copy(r.writer, sgl) // using sgl.ReadFrom
	}
	if err != nil {
		if cmn.Rom.FastV(4, cos.SmoduleXs) {
			nlog.Errorf("%s: failed to write (woff=%d, next=%d, sgl-size=%d): %v",
				r.Name(), r.woff, r.nextRoff, size, err)
		}
		return err
	}
	debug.Assertf(written == size, "%s: expected written size=%d, got %d (at woff %d)", r.Name(), size, written, r.woff)

	r.woff += size
	r.ObjsAdd(0, size)
	sgl.Reset()
	return nil
}

func (r *XactBlobDl) cleanup() {
	for i := range r.readers {
		r.sgls[i].Free()
	}
	clear(r.sgls)
	if r.args.RspW == nil { // not a GET
		core.FreeLOM(r.args.Lom)
	}
}

//
// blobReader
//
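
// run is the reader worker loop: receive a {sgl, roff} work item from workCh,
// fetch the chunk at that offset via a ranged backend GetObjReader, and report
// the filled SGL (or the error) on doneCh; exit when workCh closes, on abort,
// on error, or upon 416 (range-not-satisfiable) past the last chunk.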
func (reader *blobReader) run() {
	var (
		err       error
		written   int64
		a         = reader.parent.args
		chunkSize = reader.parent.chunkSize
		ctx       = context.Background()
	)
	for {
		msg, ok := <-reader.parent.workCh
		if !ok {
			break
		}
		sgl := msg.sgl
		res := core.T.Backend(a.Lom.Bck()).GetObjReader(ctx, a.Lom, msg.roff, chunkSize)
		if reader.parent.IsAborted() {
			break
		}
		if res.ErrCode == http.StatusRequestedRangeNotSatisfiable {
			debug.Assert(res.Size == 0)
			reader.parent.doneCh <- chunkDone{nil, sgl, msg.roff, http.StatusRequestedRangeNotSatisfiable}
			break
		}
		if err = res.Err; err == nil {
			written, err = io.Copy(sgl, res.R)
		}
		if err != nil {
			reader.parent.doneCh <- chunkDone{err, sgl, msg.roff, res.ErrCode}
			break
		}
		debug.Assert(res.Size == written, res.Size, " ", written)
		debug.Assert(sgl.Size() == written, sgl.Size(), " ", written)
		debug.Assert(sgl.Size() == sgl.Len(), sgl.Size(), " ", sgl.Len())

		reader.parent.doneCh <- chunkDone{nil, sgl, msg.roff, res.ErrCode}
	}
	reader.parent.wg.Done()
}

func (r *XactBlobDl) Snap() (snap *core.Snap) {
	snap = &core.Snap{}
	r.ToSnap(snap)

	// HACK shortcut to support progress bar
	snap.Stats.InBytes = r.fullSize
	return
}
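
// Progress note (illustrative, not part of this xaction's API): given the Snap
// above, a monitoring client could presumably estimate percent-complete as
//
//	snap := xctn.Snap()
//	pct := 100 * float64(snap.Stats.Bytes) / float64(snap.Stats.InBytes)
//
// where Stats.Bytes accumulates locally written bytes (via ObjsAdd in write)
// and Stats.InBytes is the full blob size set by the HACK shortcut above.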