github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/xact/xs/archive.go

// Package xs is a collection of eXtended actions (xactions), including multi-object
// operations, list-objects, (cluster) rebalance and (target) resilver, ETL, and more.
/*
 * Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 */
package xs

import (
	"archive/tar"
	"context"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/archive"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/transport"
	"github.com/NVIDIA/aistore/xact"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// TODO:
// - enable multi-threaded list-range iter (see lrit.init)
// - one source multiple destination buckets (feature)

type (
	archFactory struct {
		streamingF
	}
	archwi struct { // archival work item; implements lrwi
		writer  archive.Writer
		r       *XactArch
		msg     *cmn.ArchiveBckMsg
		tsi     *meta.Snode
		archlom *core.LOM
		fqn     string   // workFQN --/--
		wfh     *os.File // --/--
		cksum   cos.CksumHashSize
		cnt     atomic.Int32 // num archived
		// tar only
		appendPos int64 // append to existing
		tarFormat tar.Format
		// finishing
		refc atomic.Int32
	}
	XactArch struct {
		streamingX
		workCh  chan *cmn.ArchiveBckMsg
		bckTo   *meta.Bck
		pending struct {
			m map[string]*archwi
			sync.RWMutex
		}
	}
)

// interface guard
var (
	_ core.Xact      = (*XactArch)(nil)
	_ xreg.Renewable = (*archFactory)(nil)
	_ lrwi           = (*archwi)(nil)
)

/////////////////
// archFactory //
/////////////////

func (*archFactory) New(args xreg.Args, bck *meta.Bck) xreg.Renewable {
	p := &archFactory{streamingF: streamingF{RenewBase: xreg.RenewBase{Args: args, Bck: bck}, kind: apc.ActArchive}}
	return p
}

func (p *archFactory) Start() (err error) {
	//
	// target-local generation of a global UUID
	//
	bckTo, ok := p.Args.Custom.(*meta.Bck)
	debug.Assertf(ok, "%+v", bckTo)
	if !ok || bckTo.IsEmpty() {
		bckTo = &meta.Bck{Name: "any"} // local usage to gen uuid, see r.bckTo below
	}
	p.Args.UUID, err = p.genBEID(p.Bck, bckTo)
	if err != nil {
		return err
	}
	//
	// new x-archive
	//
	workCh := make(chan *cmn.ArchiveBckMsg, maxNumInParallel)
	r := &XactArch{streamingX: streamingX{p: &p.streamingF, config: cmn.GCO.Get()}, workCh: workCh}
	r.pending.m = make(map[string]*archwi, maxNumInParallel)
	p.xctn = r
	r.DemandBase.Init(p.UUID() /*== p.Args.UUID above*/, p.kind, p.Bck /*from*/, xact.IdleDefault)

	if err := p.newDM(p.Args.UUID /*trname*/, r.recv, r.config, cmn.OwtPut, 0 /*pdu*/); err != nil {
		return err
	}
	if r.p.dm != nil {
		r.p.dm.SetXact(r)
		r.p.dm.Open()
	}
	xact.GoRunW(r)
	return
}

//////////////
// XactArch //
//////////////

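// Begin initializes and registers the work item (archwi) for one archiving
// transaction: it generates the workfile FQN, HRW-locates the target that will
// own the resulting shard and, if this target is that owner, creates the workfile
// (or opens the existing shard for appending) and constructs the format-specific
// archive.Writer.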
func (r *XactArch) Begin(msg *cmn.ArchiveBckMsg, archlom *core.LOM) (err error) {
	if err = archlom.InitBck(&msg.ToBck); err != nil {
		r.AddErr(err, 4, cos.SmoduleXs)
		return err
	}
	debug.Assert(archlom.Cname() == msg.Cname()) // relying on it

	wi := &archwi{r: r, msg: msg, archlom: archlom,
		tarFormat: tar.FormatUnknown}
	wi.fqn = fs.CSM.Gen(wi.archlom, fs.WorkfileType, fs.WorkfileCreateArch)
	wi.cksum.Init(archlom.CksumType())

	// here and elsewhere: an extra check to make sure this target is active (ref: ignoreMaintenance)
	smap := core.T.Sowner().Get()
	if err = core.InMaintOrDecomm(smap, core.T.Snode(), r); err != nil {
		return
	}
	nat := smap.CountActiveTs()
	wi.refc.Store(int32(nat - 1))

	wi.tsi, err = smap.HrwName2T(msg.ToBck.MakeUname(msg.ArchName))
	if err != nil {
		r.AddErr(err, 4, cos.SmoduleXs)
		return
	}

	// fcreate at BEGIN time
	if core.T.SID() == wi.tsi.ID() {
		var (
			s           string
			lmfh        *os.File
			finfo, errX = os.Stat(wi.archlom.FQN)
			exists      = errX == nil
		)
		if exists && wi.msg.AppendIfExists {
			s = " append"
			lmfh, err = wi.beginAppend()
		} else {
			wi.wfh, err = wi.archlom.CreateFile(wi.fqn)
		}
		if err != nil {
			return
		}
		if cmn.Rom.FastV(5, cos.SmoduleXs) {
			nlog.Infof("%s: begin%s %s", r.Base.Name(), s, msg.Cname())
		}

		// construct format-specific writer; serialize for multi-target conc. writing
		opts := archive.Opts{Serialize: nat > 1, TarFormat: wi.tarFormat}
		wi.writer = archive.NewWriter(msg.Mime, wi.wfh, &wi.cksum, &opts)

		// append case (above)
		if lmfh != nil {
			err = wi.writer.Copy(lmfh, finfo.Size())
			if err != nil {
				wi.writer.Fini()
				wi.cleanup()
				return
			}
		}
	}

	// most of the time there'll be a single destination bucket for the lifetime
	if r.bckTo == nil {
		if from := r.Bck().Bucket(); !from.Equal(&wi.msg.ToBck) {
			r.bckTo = meta.CloneBck(&wi.msg.ToBck)
		}
	}

	r.pending.Lock()
	r.pending.m[msg.TxnUUID] = wi
	r.wiCnt.Inc()
	r.pending.Unlock()
	return
}

func (r *XactArch) Do(msg *cmn.ArchiveBckMsg) {
	r.IncPending()
	r.workCh <- msg
}

func (r *XactArch) Run(wg *sync.WaitGroup) {
	var err error
	nlog.Infoln(r.Name())
	wg.Done()
	for {
		select {
		case msg := <-r.workCh:
			r.pending.RLock()
			wi, ok := r.pending.m[msg.TxnUUID]
			r.pending.RUnlock()
			if !ok {
				debug.Assert(r.ErrCnt() > 0) // see cleanup
				goto fin
			}
			var (
				smap = core.T.Sowner().Get()
				lrit = &lriterator{}
			)
			err = lrit.init(r, &msg.ListRange, r.Bck(), true /*TODO: remove blocking*/)
			if err != nil {
				r.Abort(err)
				goto fin
			}
			err = lrit.run(wi, smap)
			if err != nil {
				r.AddErr(err)
			}
			lrit.wait()
			if r.Err() != nil {
				wi.cleanup()
				goto fin
			}
			if core.T.SID() == wi.tsi.ID() {
				go r.finalize(wi) // async finalize this shard
			} else {
				r.sendTerm(wi.msg.TxnUUID, wi.tsi, nil)
				r.pending.Lock()
				delete(r.pending.m, msg.TxnUUID)
				r.wiCnt.Dec()
				r.pending.Unlock()
				r.DecPending()

				core.FreeLOM(wi.archlom)
			}
		case <-r.IdleTimer():
			goto fin
		case <-r.ChanAbort():
			goto fin
		}
	}
fin:
	r.streamingX.fin(true /*unreg Rx*/)
	if r.Err() == nil {
		return
	}

	// [cleanup] close and rm unfinished archives (compare w/ finalize)
	r.pending.Lock()
	for _, wi := range r.pending.m {
		wi.cleanup()
	}
	clear(r.pending.m)
	r.pending.Unlock()
}

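// doSend transmits one object to the target that owns the destination shard via
// the xaction's shared data mover; the transaction UUID travels in the transport
// header's Opaque field so that the receiving side (recv/_recv below) can route
// the payload to the matching work item.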
func (r *XactArch) doSend(lom *core.LOM, wi *archwi, fh cos.ReadOpenCloser) {
	debug.Assert(r.p.dm != nil)
	o := transport.AllocSend()
	hdr := &o.Hdr
	{
		hdr.Bck = wi.msg.ToBck
		hdr.ObjName = lom.ObjName
		hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
		hdr.Opaque = []byte(wi.msg.TxnUUID)
	}
	// o.Callback nil on purpose (lom is freed by the iterator)
	r.p.dm.Send(o, fh, wi.tsi)
}

func (r *XactArch) recv(hdr *transport.ObjHdr, objReader io.Reader, err error) error {
	if err != nil && !cos.IsEOF(err) {
		r.AddErr(err, 5, cos.SmoduleXs)
		return err
	}

	r.IncPending()
	err = r._recv(hdr, objReader)
	r.DecPending()
	transport.DrainAndFreeReader(objReader)
	return err
}

func (r *XactArch) _recv(hdr *transport.ObjHdr, objReader io.Reader) error {
	r.pending.RLock()
	wi, ok := r.pending.m[cos.UnsafeS(hdr.Opaque)] // txnUUID
	r.pending.RUnlock()
	if !ok {
		if r.Finished() || r.IsAborted() {
			return nil
		}
		cnt, err := r.JoinErr()
		debug.Assert(cnt > 0) // see cleanup
		return err
	}
	debug.Assert(wi.tsi.ID() == core.T.SID() && wi.msg.TxnUUID == cos.UnsafeS(hdr.Opaque))

	// NOTE: best-effort via ref-counting
	if hdr.Opcode == opcodeDone {
		refc := wi.refc.Dec()
		debug.Assert(refc >= 0)
		return nil
	}

	debug.Assert(hdr.Opcode == 0)
	err := wi.writer.Write(wi.nameInArch(hdr.ObjName), &hdr.ObjAttrs, objReader)
	if err == nil {
		wi.cnt.Inc()
	} else {
		r.AddErr(err, 5, cos.SmoduleXs)
	}
	return nil
}

// NOTE: in goroutine
func (r *XactArch) finalize(wi *archwi) {
	q := wi.quiesce()
	if q == core.QuiTimeout {
		err := fmt.Errorf("%s: %v", r, cmn.ErrQuiesceTimeout)
		r.AddErr(err, 4, cos.SmoduleXs)
	}

	r.pending.Lock()
	delete(r.pending.m, wi.msg.TxnUUID)
	r.wiCnt.Dec()
	r.pending.Unlock()

	ecode, err := r.fini(wi)
	r.DecPending()
	if cmn.Rom.FastV(5, cos.SmoduleXs) {
		var s string
		if err != nil {
			s = fmt.Sprintf(": %v(%d)", err, ecode)
		}
		nlog.Infof("%s: finalize %s%s", r.Base.Name(), wi.msg.Cname(), s)
	}
	if err == nil || r.IsAborted() { // done ok (unless aborted)
		return
	}
	debug.Assert(q != core.QuiAborted)

	wi.cleanup()
	r.AddErr(err, 5, cos.SmoduleXs)
}

func (r *XactArch) fini(wi *archwi) (ecode int, err error) {
	wi.writer.Fini()

	if r.IsAborted() {
		wi.cleanup()
		core.FreeLOM(wi.archlom)
		return
	}

	var size int64
	if wi.cnt.Load() == 0 {
		s := "empty"
		if wi.appendPos > 0 {
			s = "no new appends to"
		}
		if cnt, errs := r.JoinErr(); cnt > 0 {
			err = fmt.Errorf("%s: %s %s, err: %v (cnt=%d)", r, s, wi.archlom, errs, cnt)
		} else {
			err = fmt.Errorf("%s: %s %s", r, s, wi.archlom)
		}
	} else {
		size, err = wi.finalize()
	}
	if err != nil {
		wi.cleanup()
		core.FreeLOM(wi.archlom)
		ecode = http.StatusInternalServerError
		return
	}

	wi.archlom.SetSize(size)
	cos.Close(wi.wfh)
	wi.wfh = nil

	ecode, err = core.T.FinalizeObj(wi.archlom, wi.fqn, r, cmn.OwtArchive)
	core.FreeLOM(wi.archlom)
	r.ObjsAdd(1, size-wi.appendPos)
	return
}

func (r *XactArch) Name() (s string) {
	s = r.streamingX.Name()
	if src, dst := r.FromTo(); src != nil {
		s += " => " + dst.String()
	}
	return
}

func (r *XactArch) String() (s string) {
	s = r.streamingX.String() + " => "
	if r.wiCnt.Load() > 0 && r.bckTo != nil {
		s += r.bckTo.String()
	}
	return
}

func (r *XactArch) FromTo() (src, dst *meta.Bck) {
	if r.bckTo != nil {
		src, dst = r.Bck(), r.bckTo
	}
	return
}

func (r *XactArch) Snap() (snap *core.Snap) {
	snap = &core.Snap{}
	r.ToSnap(snap)

	snap.IdleX = r.IsIdle()
	if f, t := r.FromTo(); f != nil {
		snap.SrcBck, snap.DstBck = f.Clone(), t.Clone()
	}
	return
}

////////////
// archwi //
////////////

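// beginAppend prepares an existing shard for appending. For TAR, it first tries
// in-place append by renaming the shard to the workfile and seeking to its end
// (see openTarForAppend); for all other formats, or when the TAR is empty, it
// opens the existing shard for reading so that Begin can copy its content into a
// fresh workfile before new entries are written.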
func (wi *archwi) beginAppend() (lmfh *os.File, err error) {
	msg := wi.msg
	if msg.Mime == archive.ExtTar {
		if err = wi.openTarForAppend(); err == nil || err != archive.ErrTarIsEmpty {
			return
		}
	}
	// msg.Mime has been already validated (see ais/* for apc.ActArchive)
	// prep to copy `lmfh` --> `wi.wfh` with subsequent APPEND-ing
	lmfh, err = wi.archlom.OpenFile()
	if err != nil {
		return
	}
	if wi.wfh, err = wi.archlom.CreateFile(wi.fqn); err != nil {
		cos.Close(lmfh)
		lmfh = nil
	}
	return
}

func (wi *archwi) openTarForAppend() (err error) {
	if err = os.Rename(wi.archlom.FQN, wi.fqn); err != nil {
		return
	}
	// open (rw) lom itself
	wi.wfh, wi.tarFormat, err = archive.OpenTarSeekEnd(wi.archlom.ObjName, wi.fqn)
	if err != nil {
		goto roll
	}
	wi.appendPos, err = wi.wfh.Seek(0, io.SeekCurrent)
	if err == nil {
		return // can append
	}
	wi.appendPos, wi.tarFormat = 0, tar.FormatUnknown // reset
	cos.Close(wi.wfh)
	wi.wfh = nil
roll:
	if errV := wi.archlom.RenameFrom(wi.fqn); errV != nil {
		nlog.Errorf("%s: nested error: failed to append %s (%v) and rename back from %s (%v)",
			wi.tsi, wi.archlom, err, wi.fqn, errV)
	} else {
		wi.fqn = ""
	}
	return
}

// multi-object iterator i/f: "handle work item"
func (wi *archwi) do(lom *core.LOM, lrit *lriterator) {
	var coldGet bool
	if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		if !cos.IsNotExist(err, 0) {
			wi.r.AddErr(err, 5, cos.SmoduleXs)
			return
		}
		if coldGet = lom.Bck().IsRemote(); !coldGet {
			if lrit.lrp == lrpList {
				// listed, not found
				wi.r.AddErr(err, 5, cos.SmoduleXs)
			}
			return
		}
	}

	if coldGet {
		// cold
		if ecode, err := core.T.GetCold(context.Background(), lom, cmn.OwtGetLock); err != nil {
			if lrit.lrp != lrpList && cos.IsNotExist(err, ecode) {
				return // range or prefix, not found
			}
			wi.r.AddErr(err, 5, cos.SmoduleXs)
			return
		}
	}

	fh, err := cos.NewFileHandle(lom.FQN)
	if err != nil {
		wi.r.AddErr(err, 5, cos.SmoduleXs)
		return
	}
	if core.T.SID() != wi.tsi.ID() {
		wi.r.doSend(lom, wi, fh)
		return
	}
	debug.Assert(wi.wfh != nil) // see Begin
	err = wi.writer.Write(wi.nameInArch(lom.ObjName), lom, fh /*reader*/)
	cos.Close(fh)
	if err == nil {
		wi.cnt.Inc()
	} else {
		wi.r.AddErr(err, 5, cos.SmoduleXs)
	}
}

func (wi *archwi) quiesce() core.QuiRes {
	timeout := cmn.Rom.CplaneOperation()
	return wi.r.Quiesce(timeout, func(total time.Duration) core.QuiRes {
		if wi.refc.Load() == 0 && wi.r.wiCnt.Load() == 1 /*the last wi (so far) about to `fini`*/ {
			return core.QuiDone
		}
		return xact.RefcntQuiCB(&wi.refc, wi.r.config.Timeout.SendFile.D()/2, total)
	})
}

func (wi *archwi) nameInArch(objName string) string {
	if !wi.msg.InclSrcBname {
		return objName
	}
	buf := make([]byte, 0, len(wi.msg.FromBckName)+1+len(objName))
	buf = append(buf, wi.msg.FromBckName...)
	buf = append(buf, filepath.Separator)
	buf = append(buf, objName...)
	return cos.UnsafeS(buf)
}

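// cleanup closes the workfile, if still open, and removes it unless its path
// coincides with the archive object's own FQN; called on abort and error paths
// (compare with finalize below).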
func (wi *archwi) cleanup() {
	if wi.wfh != nil {
		cos.Close(wi.wfh)
		wi.wfh = nil
	}
	if wi.fqn != "" {
		if wi.archlom == nil || wi.archlom.FQN != wi.fqn {
			cos.RemoveFile(wi.fqn)
		}
		wi.fqn = ""
	}
}

func (wi *archwi) finalize() (int64, error) {
	if wi.appendPos > 0 {
		size, err := wi.wfh.Seek(0, io.SeekCurrent)
		if err != nil {
			return 0, err
		}
		debug.Assertf(size > wi.appendPos, "%d vs %d", size, wi.appendPos)
		// checksum traded off
		wi.archlom.SetCksum(cos.NewCksum(cos.ChecksumNone, ""))
		return size, nil
	}
	wi.cksum.Finalize()
	wi.archlom.SetCksum(&wi.cksum.Cksum)
	return wi.cksum.Size, nil
}