// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"archive/tar"
	"context"
	"encoding"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/ais/s3"
	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/archive"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/feat"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/ec"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/mirror"
	"github.com/NVIDIA/aistore/reb"
	"github.com/NVIDIA/aistore/stats"
	"github.com/NVIDIA/aistore/transport"
	"github.com/NVIDIA/aistore/transport/bundle"
	"github.com/NVIDIA/aistore/xact/xreg"
)

//
// PUT, GET, APPEND (to file | to archive), and COPY object
//

type (
	putOI struct {
		oreq       *http.Request
		r          io.ReadCloser // content reader
		xctn       core.Xact     // xaction that puts
		t          *target       // this
		lom        *core.LOM     // obj
		cksumToUse *cos.Cksum    // if available (not `none`), can be validated and will be stored
		config     *cmn.Config   // (during this request)
		resphdr    http.Header   // as implied
		workFQN    string        // temp fqn to be renamed
		atime      int64         // access time.Now()
		ltime      int64         // mono.NanoTime, to measure latency
		size       int64         // aka Content-Length
		owt        cmn.OWT       // object write transaction enum { OwtPut, ..., OwtGet* }
		restful    bool          // being invoked via RESTful API
		t2t        bool          // by another target
		skipEC     bool          // do not erasure-encode when finalizing
		skipVC     bool          // skip loading existing Version and skip comparing Checksums (skip VC)
		coldGET    bool          // (one implication: proceed to write)
	}

	getOI struct {
		req        *http.Request
		w          http.ResponseWriter
		ctx        context.Context // context used when getting object from remote backend (access creds)
		t          *target         // this
		lom        *core.LOM       // obj
		dpq        *dpq
		ranges     byteRanges // range read (see https://www.rfc-editor.org/rfc/rfc7233#section-2.1)
		atime      int64      // access time.Now()
		ltime      int64      // mono.NanoTime, to measure latency
		chunked    bool       // chunked transfer (en)coding: https://tools.ietf.org/html/rfc7230#page-36
		unlocked   bool       // internal
		verchanged bool       // version changed
		retry      bool       // once
		cold       bool       // true if executed backend.Get
		latestVer  bool       // QparamLatestVer || 'versioning.*_warm_get'
	}

	// textbook append: (packed) handle and control structure (see also `putA2I` arch below)
	aoHdl struct {
		partialCksum *cos.CksumHash
		nodeID       string
		workFQN      string
	}
	apndOI struct {
		started int64         // start time (nanoseconds)
		r       io.ReadCloser // content reader
		t       *target       // this
		config  *cmn.Config   // (during this request)
		lom     *core.LOM     // append to or _as_
		cksum   *cos.Cksum    // checksum expected once Flush-ed
		hdl     aoHdl         // (packed)
		op      string        // enum {apc.AppendOp, apc.FlushOp}
		size    int64         // Content-Length
	}

	copyOI core.CopyParams

	sendArgs struct {
		reader    cos.ReadOpenCloser
		dm        *bundle.DataMover
		objAttrs  cos.OAH
		tsi       *meta.Snode
		bckTo     *meta.Bck
		objNameTo string
		owt       cmn.OWT
	}

	// put/append-to arch
	putA2I struct {
		r        io.ReadCloser // read bytes to append
		t        *target       // this
		lom      *core.LOM     // resulting shard
		filename string        // fqn inside
		mime     string        // format
		started  int64         // time of receiving
		size     int64         // aka Content-Length
		put      bool          // overwrite
	}
)
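
// NOTE (summary): putOI, getOI, and sendArgs above are pooled and reused - see
// allocPOI/freePOI, allocGOI/freeGOI, and allocSnda/freeSnda at the end of this file;
// each free* zeroes the structure before returning it to its sync.Pool.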

//
// PUT(object)
//

// poi.restful entry point
func (poi *putOI) do(resphdr http.Header, r *http.Request, dpq *dpq) (int, error) {
	{
		poi.oreq = r
		poi.r = r.Body
		poi.resphdr = resphdr
		poi.workFQN = fs.CSM.Gen(poi.lom, fs.WorkfileType, fs.WorkfilePut)
		poi.cksumToUse = poi.lom.ObjAttrs().FromHeader(r.Header)
		poi.owt = cmn.OwtPut // default
	}
	if dpq.owt != "" {
		poi.owt.FromS(dpq.owt)
	}
	if dpq.uuid != "" {
		// resolve cluster-wide xact "behind" this PUT (promote via a single target won't show up)
		xctn, err := xreg.GetXact(dpq.uuid)
		if err != nil {
			nlog.Errorln(err)
			return 0, err
		}
		if xctn != nil {
			poi.xctn = xctn
		}
	}
	if sizeStr := r.Header.Get(cos.HdrContentLength); sizeStr != "" {
		if size, ers := strconv.ParseInt(sizeStr, 10, 64); ers == nil {
			poi.size = size
		}
	}
	return poi.putObject()
}

func (poi *putOI) putObject() (ecode int, err error) {
	poi.ltime = mono.NanoTime()
	// PUT is a no-op if the checksums do match
	if !poi.skipVC && !poi.coldGET && !poi.cksumToUse.IsEmpty() {
		if poi.lom.EqCksum(poi.cksumToUse) {
			if cmn.Rom.FastV(4, cos.SmoduleAIS) {
				nlog.Infof("destination %s has identical %s: PUT is a no-op", poi.lom, poi.cksumToUse)
			}
			cos.DrainReader(poi.r)
			return 0, nil
		}
	}

	buf, slab, lmfh, erw := poi.write()
	poi._cleanup(buf, slab, lmfh, erw)
	if erw != nil {
		err, ecode = erw, http.StatusInternalServerError
		goto rerr
	}

	if ecode, err = poi.finalize(); err != nil {
		goto rerr
	}

	// resp. header & stats
	if !poi.t2t {
		// NOTE: counting only user PUTs; ignoring EC and copies, on the one hand, and
		// same-checksum-skip-writing, on the other
		if poi.owt == cmn.OwtPut && poi.restful {
			debug.Assert(cos.IsValidAtime(poi.atime), poi.atime)
			size := poi.lom.SizeBytes()
			poi.t.statsT.AddMany(
				cos.NamedVal64{Name: stats.PutCount, Value: 1},
				cos.NamedVal64{Name: stats.PutSize, Value: size},
				cos.NamedVal64{Name: stats.PutThroughput, Value: size},
				cos.NamedVal64{Name: stats.PutLatency, Value: mono.SinceNano(poi.ltime)},
			)
			// RESTful PUT response header
			if poi.resphdr != nil {
				cmn.ToHeader(poi.lom.ObjAttrs(), poi.resphdr, 0 /*skip setting content-length*/)
			}
		}
	} else if poi.xctn != nil && poi.owt == cmn.OwtPromote {
		// xaction in-objs counters, promote first
		poi.xctn.InObjsAdd(1, poi.lom.SizeBytes())
	}
	if cmn.Rom.FastV(5, cos.SmoduleAIS) {
		nlog.Infoln(poi.loghdr())
	}
	return
rerr:
	if poi.owt == cmn.OwtPut && poi.restful && !poi.t2t {
		poi.t.statsT.IncErr(stats.PutCount)
	}
	return
}

// verbose only
func (poi *putOI) loghdr() string {
	sb := strings.Builder{}
	sb.WriteString(poi.owt.String())
	sb.WriteString(", ")
	sb.WriteString(poi.lom.String())
	if poi.xctn != nil {
		sb.WriteString(", ")
		sb.WriteString(poi.xctn.String())
	}
	if poi.skipVC {
		sb.WriteString(", skip-vc")
	}
	if poi.coldGET {
		sb.WriteString(", cold-get")
	}
	if poi.t2t {
		sb.WriteString(", t2t")
	}
	return sb.String()
}

func (poi *putOI) finalize() (ecode int, err error) {
	if ecode, err = poi.fini(); err != nil {
		if err1 := cos.Stat(poi.workFQN); err1 == nil || !os.IsNotExist(err1) {
			if err1 == nil {
				err1 = err
			}
			poi.t.fsErr(err1, poi.workFQN)
			if err2 := cos.RemoveFile(poi.workFQN); err2 != nil && !os.IsNotExist(err2) {
				nlog.Errorf(fmtNested, poi.t, err1, "remove", poi.workFQN, err2)
			}
		}
		poi.lom.Uncache()
		if ecode != http.StatusInsufficientStorage && cmn.IsErrCapExceeded(err) {
			ecode = http.StatusInsufficientStorage
		}
		return ecode, err
	}
	if !poi.skipEC {
		if ecErr := ec.ECM.EncodeObject(poi.lom, nil); ecErr != nil && ecErr != ec.ErrorECDisabled {
			err = ecErr
			if ecode != http.StatusInsufficientStorage && cmn.IsErrCapExceeded(err) {
				ecode = http.StatusInsufficientStorage
			}
			return ecode, err
		}
	}
	poi.t.putMirror(poi.lom)
	return 0, nil
}

// poi.workFQN => LOM
func (poi *putOI) fini() (ecode int, err error) {
	var (
		lom = poi.lom
		bck = lom.Bck()
	)
	// put remote
	if bck.IsRemote() && poi.owt < cmn.OwtRebalance {
		ecode, err = poi.putRemote()
		if err != nil {
			loghdr := poi.loghdr()
			nlog.Errorf("PUT (%s): %v(%d)", loghdr, err, ecode)
			if ecode != http.StatusServiceUnavailable {
				return
			}
			// (googleapi: "Error 503: We encountered an internal error. Please try again.")
			time.Sleep(time.Second)
			ecode, err = poi.putRemote()
			if err != nil {
				return
			}
			nlog.Infof("PUT (%s): retried OK", loghdr)
		}
	}

	// locking strategies: optimistic and otherwise
	// (see GetCold() implementation and cmn.OWT enum)
	switch poi.owt {
	case cmn.OwtGetTryLock, cmn.OwtGetLock, cmn.OwtGet:
		debug.AssertFunc(func() bool { _, exclusive := lom.IsLocked(); return exclusive })
	case cmn.OwtGetPrefetchLock:
		if !lom.TryLock(true) {
			if cmn.Rom.FastV(4, cos.SmoduleAIS) {
				nlog.Warningln(poi.loghdr(), "is busy")
			}
			return 0, cmn.ErrSkip // e.g. prefetch can skip it and keep on going
		}
		defer lom.Unlock(true)
	default:
		// expecting valid atime passed with `poi`
		debug.Assert(cos.IsValidAtime(poi.atime), poi.atime)
		lom.Lock(true)
		defer lom.Unlock(true)
		lom.SetAtimeUnix(poi.atime)
	}

	// ais versioning
	if bck.IsAIS() && lom.VersionConf().Enabled {
		if poi.owt < cmn.OwtRebalance {
			if poi.skipVC {
				err = lom.IncVersion()
				debug.AssertNoErr(err)
			} else if remSrc, ok := lom.GetCustomKey(cmn.SourceObjMD); !ok || remSrc == "" {
				if err = lom.IncVersion(); err != nil {
					nlog.Errorln(err)
				}
			}
		}
	}

	// done
	if err = lom.RenameFrom(poi.workFQN); err != nil {
		return
	}
	if lom.HasCopies() {
		if errdc := lom.DelAllCopies(); errdc != nil {
			nlog.Errorf("PUT (%s): failed to delete old copies [%v], proceeding anyway...", poi.loghdr(), errdc)
		}
	}
	if lom.AtimeUnix() == 0 { // (is set when migrating within cluster; prefetch special case)
		lom.SetAtimeUnix(poi.atime)
	}
	err = lom.PersistMain()
	return
}

// via backend.PutObj()
func (poi *putOI) putRemote() (ecode int, err error) {
	var (
		lom     = poi.lom
		backend = poi.t.Backend(lom.Bck())
	)
	lmfh, err := cos.NewFileHandle(poi.workFQN)
	if err != nil {
		err = cmn.NewErrFailedTo(poi.t, "open", poi.workFQN, err)
		return
	}
	if poi.owt == cmn.OwtPut && !lom.Bck().IsRemoteAIS() {
		// some/all of those are set by the backend.PutObj()
		lom.ObjAttrs().DelCustomKeys(cmn.SourceObjMD, cmn.CRC32CObjMD, cmn.ETag, cmn.MD5ObjMD, cmn.VersionObjMD)
	}

	ecode, err = backend.PutObj(lmfh, lom, poi.oreq)
	if err == nil && !lom.Bck().IsRemoteAIS() {
		lom.SetCustomKey(cmn.SourceObjMD, backend.Provider())
	}
	return
}
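
// Overview (for reference): a RESTful PUT goes do() -> putObject() -> write() (receive
// into a workfile while computing/validating checksums) -> finalize() -> fini() (rename
// the workfile into place, persist metadata) -> optional erasure-coding and n-way mirroring.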

// LOM is updated at the end of this call with size and checksum.
// `poi.r` (reader) is also closed upon exit.
func (poi *putOI) write() (buf []byte, slab *memsys.Slab, lmfh *os.File, err error) {
	var (
		written int64
		cksums  = struct {
			store     *cos.CksumHash // store with LOM
			expct     *cos.Cksum     // caller-provided (aka "end-to-end protection")
			compt     *cos.CksumHash // compute to validate `expct` - iff provided
			finalized bool           // to avoid computing the same checksum type twice
		}{}
		ckconf = poi.lom.CksumConf()
	)
	if lmfh, err = poi.lom.CreateFile(poi.workFQN); err != nil {
		return
	}
	if poi.size <= 0 {
		buf, slab = poi.t.gmm.Alloc()
	} else {
		buf, slab = poi.t.gmm.AllocSize(poi.size)
	}

	switch {
	case ckconf.Type == cos.ChecksumNone:
		poi.lom.SetCksum(cos.NoneCksum)
		// not using `ReadFrom` of the `*os.File` -
		// ultimately, https://github.com/golang/go/blob/master/src/internal/poll/copy_file_range_linux.go#L100
		written, err = cos.CopyBuffer(lmfh, poi.r, buf)
	case !poi.cksumToUse.IsEmpty() && !poi.validateCksum(ckconf):
		// if the corresponding validation is not configured/enabled we just go ahead
		// and use the checksum that has arrived with the object
		poi.lom.SetCksum(poi.cksumToUse)
		// (ditto)
		written, err = cos.CopyBuffer(lmfh, poi.r, buf)
	default:
		writers := make([]io.Writer, 0, 3)
		cksums.store = cos.NewCksumHash(ckconf.Type) // always according to the bucket
		writers = append(writers, cksums.store.H)
		if !poi.skipVC && !poi.cksumToUse.IsEmpty() && poi.validateCksum(ckconf) {
			cksums.expct = poi.cksumToUse
			if poi.cksumToUse.Type() == cksums.store.Type() {
				cksums.compt = cksums.store
			} else {
				// otherwise, compute separately
				cksums.compt = cos.NewCksumHash(poi.cksumToUse.Type())
				writers = append(writers, cksums.compt.H)
			}
		}
		writers = append(writers, lmfh)
		written, err = cos.CopyBuffer(cos.NewWriterMulti(writers...), poi.r, buf) // (ditto)
	}
	if err != nil {
		return
	}

	// validate
	if cksums.compt != nil {
		cksums.finalized = cksums.compt == cksums.store
		cksums.compt.Finalize()
		if !cksums.compt.Equal(cksums.expct) {
			err = cos.NewErrDataCksum(cksums.expct, &cksums.compt.Cksum, poi.lom.String())
			poi.t.statsT.AddMany(
				cos.NamedVal64{Name: stats.ErrCksumCount, Value: 1},
				cos.NamedVal64{Name: stats.ErrCksumSize, Value: written},
			)
			return
		}
	}

	// ok
	if poi.lom.IsFeatureSet(feat.FsyncPUT) {
		err = lmfh.Sync() // compare w/ cos.FlushClose
		debug.AssertNoErr(err)
	}

	cos.Close(lmfh)
	lmfh = nil

	poi.lom.SetSize(written) // TODO: compare with non-zero lom.SizeBytes() that may have been set via oa.FromHeader()
	if cksums.store != nil {
		if !cksums.finalized {
			cksums.store.Finalize()
		}
		poi.lom.SetCksum(&cksums.store.Cksum)
	}
	return
}

// post-write close & cleanup
func (poi *putOI) _cleanup(buf []byte, slab *memsys.Slab, lmfh *os.File, err error) {
	if buf != nil {
		slab.Free(buf)
	}
	if err == nil {
		cos.Close(poi.r)
		return // ok
	}

	// not ok
	poi.r.Close()
	if nerr := lmfh.Close(); nerr != nil {
		nlog.Errorf(fmtNested, poi.t, err, "close", poi.workFQN, nerr)
	}
	if nerr := cos.RemoveFile(poi.workFQN); nerr != nil && !os.IsNotExist(nerr) {
		nlog.Errorf(fmtNested, poi.t, err, "remove", poi.workFQN, nerr)
	}
}

func (poi *putOI) validateCksum(c *cmn.CksumConf) (v bool) {
	switch poi.owt {
	case cmn.OwtRebalance, cmn.OwtCopy:
		v = c.ValidateObjMove
	case cmn.OwtPut:
		v = true
	case cmn.OwtGetTryLock, cmn.OwtGetLock, cmn.OwtGet:
		v = c.ValidateColdGet
	case cmn.OwtGetPrefetchLock:
	default:
		debug.Assert(false, poi.owt)
	}
	return
}

//
// GET(object)
//

func (goi *getOI) getObject() (ecode int, err error) {
	debug.Assert(!goi.unlocked)
	goi.lom.Lock(false)
	ecode, err = goi.get()
	if !goi.unlocked {
		goi.lom.Unlock(false)
	}
	return ecode, err
}

// is under rlock
func (goi *getOI) get() (ecode int, err error) {
	var (
		cs          fs.CapStatus
		doubleCheck bool
		retried     bool
		cold        bool
	)
do:
	err = goi.lom.Load(true /*cache it*/, true /*locked*/)
	if err != nil {
		cold = cos.IsNotExist(err, 0)
		if !cold {
			return http.StatusInternalServerError, err
		}
		if goi.lom.IsFeatureSet(feat.DisableColdGET) && goi.lom.Bck().IsRemote() {
			return http.StatusNotFound, fmt.Errorf("%w (cold GET disabled)", err)
		}
		cs = fs.Cap()
		if cs.IsOOS() {
			return http.StatusInsufficientStorage, cs.Err()
		}
		if errN := cmn.ValidateObjName(goi.lom.ObjName); errN != nil {
			return 0, errN
		}
	}

	switch {
	case cold && goi.lom.Bck().IsAIS():
		// ais bucket with no backend - try recover
		goi.lom.Unlock(false)
		doubleCheck, ecode, err = goi.restoreFromAny(false /*skipLomRestore*/)
		if doubleCheck && err != nil {
			lom2 := core.AllocLOM(goi.lom.ObjName)
			er2 := lom2.InitBck(goi.lom.Bucket())
			if er2 == nil {
				er2 = lom2.Load(true /*cache it*/, false /*locked*/)
			}
			if er2 == nil {
				core.FreeLOM(goi.lom)
				goi.lom = lom2
				err = nil
			} else {
				core.FreeLOM(lom2)
			}
		}
		if err != nil {
			goi.unlocked = true
			return ecode, err
		}
		goi.lom.Lock(false)
		if err = goi.lom.Load(true /*cache it*/, true /*locked*/); err != nil {
			return 0, err
		}
		goto fin // ok, done
	case cold:
		// have remote backend - use it
	case goi.latestVer:
		// apc.QparamLatestVer or 'versioning.validate_warm_get'
		res := goi.lom.CheckRemoteMD(true /* rlocked */, false /*synchronize*/, goi.req)
		if res.Err != nil {
			return res.ErrCode, res.Err
		}
		if !res.Eq {
			cold, goi.verchanged = true, true
		}
		// TODO: utilize res.ObjAttrs
	}

	// validate checksums and recover (a.k.a. self-heal) if corrupted
	if !cold && goi.lom.CksumConf().ValidateWarmGet {
		cold, ecode, err = goi.validateRecover()
		if err != nil {
			if !cold {
				nlog.Errorln(err)
				return ecode, err
			}
			nlog.Errorf("%v - proceeding to cold-GET from %s", err, goi.lom.Bck())
		}
	}

	// cold-GET: upgrade rlock => wlock, call t.Backend.GetObjReader
	if cold {
		var (
			res    core.GetReaderResult
			ckconf = goi.lom.CksumConf()
			loaded bool
		)
		if cs.IsNil() {
			cs = fs.Cap()
		}
		if cs.IsOOS() {
			return http.StatusInsufficientStorage, cs.Err()
		}
		goi.lom.SetAtimeUnix(goi.atime)

		if loaded, err = goi._coldLock(); err != nil {
			return 0, err
		}
		if loaded {
			goto fin
		}

		// zero-out prev. version custom metadata, if any
		goi.lom.SetCustomMD(nil)

		// get remote reader (compare w/ t.GetCold)
		res = goi.t.Backend(goi.lom.Bck()).GetObjReader(goi.ctx, goi.lom, 0, 0)
		if res.Err != nil {
			goi.lom.Unlock(true)
			goi.unlocked = true
			if !cos.IsNotExist(res.Err, res.ErrCode) {
				nlog.Infoln(ftcg+"(read)", goi.lom.Cname(), res.Err, res.ErrCode)
			}
			return res.ErrCode, res.Err
		}
		goi.cold = true

		// 3 alternative ways to perform cold GET
		if goi.dpq.arch.path == "" && goi.dpq.arch.regx == "" &&
			(ckconf.Type == cos.ChecksumNone || (!ckconf.ValidateColdGet && !ckconf.EnableReadRange)) {
			if goi.ranges.Range == "" && goi.lom.IsFeatureSet(feat.StreamingColdGET) {
				err = goi.coldStream(&res)
			} else {
				err = goi.coldReopen(&res)
			}
			goi.unlocked = true // always
			return 0, err
		}
		// otherwise, regular path
		ecode, err = goi._coldPut(&res)
		if err != nil {
			goi.unlocked = true
			return ecode, err
		}
		// with remaining stats via goi.stats()
		goi.t.statsT.AddMany(
			cos.NamedVal64{Name: stats.GetColdCount, Value: 1},
			cos.NamedVal64{Name: stats.GetColdSize, Value: res.Size},
			cos.NamedVal64{Name: stats.GetColdRwLatency, Value: mono.SinceNano(goi.ltime)},
		)
	}

	// read locally and stream back
fin:
	ecode, err = goi.txfini()
	if err == nil {
		debug.Assert(ecode == 0, ecode)
		return 0, nil
	}
	goi.lom.Uncache()
	if goi.retry {
		goi.retry = false
		if !retried {
			nlog.Warningf("GET %s: retrying...", goi.lom)
			retried = true // only once
			goto do
		}
		nlog.Warningf("GET %s: failed retrying %v(%d)", goi.lom, err, ecode)
	}
	return ecode, err
}

// upgrade rlock => wlock
// done early to prevent multiple cold-readers duplicating network/disk operation and overwriting each other
func (goi *getOI) _coldLock() (loaded bool, err error) {
	var (
		t, lom = goi.t, goi.lom
		now    int64
	)
outer:
	for lom.UpgradeLock() {
		if erl := lom.Load(true /*cache it*/, true /*locked*/); erl == nil {
			// nothing to do
			// (lock was upgraded by another goroutine that had also performed PUT on our behalf)
			return true, nil
		}
		switch {
		case now == 0:
			now = mono.NanoTime()
			fallthrough
		case mono.Since(now) < max(cmn.Rom.CplaneOperation(), 2*time.Second):
			nlog.Errorln(t.String()+": failed to load", lom.String(), err, "- retrying...")
		default:
			err = cmn.NewErrBusy("object", lom.Cname())
			break outer
		}
	}
	return
}

func (goi *getOI) _coldPut(res *core.GetReaderResult) (int, error) {
	var (
		t, lom = goi.t, goi.lom
		poi    = allocPOI()
	)
	{
		poi.t = t
		poi.lom = lom
		poi.config = cmn.GCO.Get()
		poi.r = res.R
		poi.size = res.Size
		poi.workFQN = fs.CSM.Gen(lom, fs.WorkfileType, fs.WorkfileColdget)
		poi.atime = goi.atime
		poi.owt = cmn.OwtGet
		poi.cksumToUse = res.ExpCksum // expected checksum (to validate if the bucket's `validate_cold_get == true`)
		poi.coldGET = true
	}
	code, err := poi.putObject()
	freePOI(poi)

	if err != nil {
		lom.Unlock(true)
		nlog.Infoln(ftcg+"(put)", lom.Cname(), err)
		return code, err
	}

	// load, downgrade lock, inc stats
	if err = lom.Load(true /*cache it*/, true /*locked*/); err != nil {
		lom.Unlock(true)
		err = fmt.Errorf("unexpected failure to load %s: %w", lom, err) // (unlikely)
		nlog.Errorln(err)
		return http.StatusInternalServerError, err
	}

	lom.DowngradeLock()
	return 0, nil
}

// - validate checksums
// - if corrupted and IsAIS, try to recover from redundant replicas or EC slices
// - otherwise, rely on the remote backend for recovery (tradeoff; TODO: make it configurable)
func (goi *getOI) validateRecover() (coldGet bool, code int, err error) {
	var (
		lom     = goi.lom
		retried bool
	)
validate:
	err = lom.ValidateMetaChecksum()
	if err == nil {
		err = lom.ValidateContentChecksum()
	}
	if err == nil {
		return
	}
	code = http.StatusInternalServerError
	if _, ok := err.(*cos.ErrBadCksum); !ok {
		return
	}
	if !lom.Bck().IsAIS() {
		coldGet = true
		return
	}

	nlog.Warningln(err)
	redundant := lom.HasCopies() || lom.ECEnabled()
	//
	// return err if there's no redundancy OR already recovered once (and failed)
	//
	if retried || !redundant {
		//
		// TODO: mark `deleted` and postpone actual deletion
		//
		if erl := lom.Remove(true /*force through rlock*/); erl != nil {
			nlog.Warningf("%s: failed to remove corrupted %s, err: %v", goi.t, lom, erl)
		}
		return
	}
	//
	// try to recover from BAD CHECKSUM
	//
	cos.RemoveFile(lom.FQN) // TODO: ditto

	if lom.HasCopies() {
		retried = true
		goi.lom.Unlock(false)
		// lookup and restore the object from local replicas
		restored := lom.RestoreToLocation()
		goi.lom.Lock(false)
		if restored {
			nlog.Warningf("%s: recovered corrupted %s from local replica", goi.t, lom)
			code = 0
			goto validate
		}
	}
	if lom.ECEnabled() {
		retried = true
		goi.lom.Unlock(false)
		cos.RemoveFile(lom.FQN)
		_, code, err = goi.restoreFromAny(true /*skipLomRestore*/)
		goi.lom.Lock(false)
		if err == nil {
			nlog.Warningf("%s: recovered corrupted %s from EC slices", goi.t, lom)
			code = 0
			goto validate
		}
	}

	// TODO: ditto
	if erl := lom.Remove(true /*force through rlock*/); erl != nil {
		nlog.Warningf("%s: failed to remove corrupted %s, err: %v", goi.t, lom, erl)
	}
	return
}

// attempt to restore an object from any/all of the below:
// 1) local copies (other FSes on this target)
// 2) other targets (when resilvering or rebalancing is running (aka GFN))
// 3) other targets if the bucket is erasure coded
// 4) Cloud
func (goi *getOI) restoreFromAny(skipLomRestore bool) (doubleCheck bool, ecode int, err error) {
	var (
		tsi  *meta.Snode
		smap = goi.t.owner.smap.get()
	)
	// NOTE: including targets 'in maintenance mode'
	tsi, err = smap.HrwHash2Tall(goi.lom.Digest())
	if err != nil {
		return
	}
	if !skipLomRestore {
		// when resilvering:
		// (whether or not resilvering is active depends on the context: mountpath events vs GET)
		var (
			resMarked = xreg.GetResilverMarked()
			running   = resMarked.Xact != nil
			gfnActive = goi.t.res.IsActive(3 /*interval-of-inactivity multiplier*/)
		)
		if resMarked.Interrupted || running || gfnActive {
			if goi.lom.RestoreToLocation() { // from copies
				nlog.Infof("%s restored to location", goi.lom)
				return
			}
			doubleCheck = running
		}
	}

	// when rebalancing: cluster-wide lookup (aka "get from neighbor" or GFN)
	var (
		gfnNode   *meta.Snode
		marked    = xreg.GetRebMarked()
		running   = marked.Xact != nil
		gfnActive = reb.IsGFN() // GFN(global rebalance)
		ecEnabled = goi.lom.ECEnabled()
		// TODO: when not enough EC targets to restore a sliced object,
		// we might still be able to restore from the object's full replica
		enoughECRestoreTargets = goi.lom.Bprops().EC.RequiredRestoreTargets() <= smap.CountActiveTs()
	)
	if running {
		doubleCheck = true
	}
	if running && tsi.ID() != goi.t.SID() {
		if goi.t.headt2t(goi.lom, tsi, smap) {
			gfnNode = tsi
			goto gfn
		}
	}
	if running || !enoughECRestoreTargets ||
		((marked.Interrupted || marked.Restarted || gfnActive) && !ecEnabled) {
		gfnNode = goi.t.headObjBcast(goi.lom, smap)
	}
gfn:
	if gfnNode != nil {
		if goi.getFromNeighbor(goi.lom, gfnNode) {
			return
		}
	}

	// restore from existing EC slices, if possible
	ecErr := ec.ECM.RestoreObject(goi.lom)
	if ecErr == nil {
		ecErr = goi.lom.Load(true /*cache it*/, false /*locked*/) // TODO: optimize locking
		debug.AssertNoErr(ecErr)
		if ecErr == nil {
			nlog.Infoln(goi.t.String(), "EC-recovered", goi.lom.String())
			return
		}
		err = cmn.NewErrFailedTo(goi.t, "load EC-recovered", goi.lom.Cname(), ecErr)
	} else if ecErr != ec.ErrorECDisabled {
		err = cmn.NewErrFailedTo(goi.t, "EC-recover", goi.lom.Cname(), ecErr)
		if cmn.IsErrCapExceeded(ecErr) {
			ecode = http.StatusInsufficientStorage
		}
		return
	}

	if err != nil {
		err = cmn.NewErrFailedTo(goi.t, "goi-restore-any", goi.lom.Cname(), err)
	} else {
		err = cos.NewErrNotFound(goi.t, goi.lom.Cname())
	}
	ecode = http.StatusNotFound
	return
}

func (goi *getOI) getFromNeighbor(lom *core.LOM, tsi *meta.Snode) bool {
	query := lom.Bck().NewQuery()
	query.Set(apc.QparamIsGFNRequest, "true")
	reqArgs := cmn.AllocHra()
	{
		reqArgs.Method = http.MethodGet
		reqArgs.Base = tsi.URL(cmn.NetIntraData)
		reqArgs.Header = http.Header{
			apc.HdrCallerID:   []string{goi.t.SID()},
			apc.HdrCallerName: []string{goi.t.callerName()},
		}
		reqArgs.Path = apc.URLPathObjects.Join(lom.Bck().Name, lom.ObjName)
		reqArgs.Query = query
	}
	config := cmn.GCO.Get()
	req, _, cancel, err := reqArgs.ReqWithTimeout(config.Timeout.SendFile.D())
	if err != nil {
		debug.AssertNoErr(err)
		return false
	}
	defer cancel()

	resp, err := g.client.data.Do(req) //nolint:bodyclose // closed by `poi.putObject`
	cmn.FreeHra(reqArgs)
	if err != nil {
		nlog.Errorf("%s: gfn failure, %s %q, err: %v", goi.t, tsi, lom, err)
		return false
	}

	cksumToUse := lom.ObjAttrs().FromHeader(resp.Header)
	workFQN := fs.CSM.Gen(lom, fs.WorkfileType, fs.WorkfileRemote)
	poi := allocPOI()
	{
		poi.t = goi.t
		poi.lom = lom
		poi.config = config
		poi.r = resp.Body
		poi.owt = cmn.OwtRebalance
		poi.workFQN = workFQN
		poi.atime = lom.ObjAttrs().Atime
		poi.cksumToUse = cksumToUse
	}
	ecode, erp := poi.putObject()
	freePOI(poi)
	if erp == nil {
		if cmn.Rom.FastV(5, cos.SmoduleAIS) {
			nlog.Infof("%s: gfn %s <= %s", goi.t, goi.lom, tsi)
		}
		return true
	}
	nlog.Errorf("%s: gfn-GET failed to PUT locally: %v(%d)", goi.t, erp, ecode)
	return false
}

func (goi *getOI) txfini() (ecode int, err error) {
	var (
		lmfh *os.File
		hrng *htrange
		fqn  = goi.lom.FQN
		dpq  = goi.dpq
	)
	if !goi.cold && !dpq.isGFN {
		fqn = goi.lom.LBGet() // best-effort GET load balancing (see also mirror.findLeastUtilized())
	}
	// open
	lmfh, err = os.Open(fqn)
	if err != nil {
		if os.IsNotExist(err) {
			ecode = http.StatusNotFound
			goi.retry = true // (!lom.IsAIS() || lom.ECEnabled() || GFN...)
		} else {
			goi.t.fsErr(err, fqn)
			ecode = http.StatusInternalServerError
			err = cmn.NewErrFailedTo(goi.t, "goi-finalize", goi.lom.Cname(), err, ecode)
		}
		return ecode, err
	}

	whdr := goi.w.Header()

	// transmit (range, arch, regular)
	switch {
	case goi.ranges.Range != "":
		debug.Assert(!dpq.isArch())
		rsize := goi.lom.SizeBytes()
		if goi.ranges.Size > 0 {
			rsize = goi.ranges.Size
		}
		if hrng, ecode, err = goi.rngToHeader(whdr, rsize); err != nil {
			break
		}
		err = goi._txrng(fqn, lmfh, whdr, hrng)
	case dpq.isArch():
		err = goi._txarch(fqn, lmfh, whdr)
	default:
		err = goi._txreg(fqn, lmfh, whdr)
	}

	cos.Close(lmfh)
	return ecode, err
}
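
// The three helpers below implement the transmit switch above:
// - _txrng: range read; when ckconf.EnableReadRange is set, the checksum is computed over
//   just the requested range (buffered in an SGL) and returned via the response headers;
// - _txreg: whole object;
// - _txarch: archived content - a single member, or a multi-match selection streamed as TAR.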

func (goi *getOI) _txrng(fqn string, lmfh *os.File, whdr http.Header, hrng *htrange) (err error) {
	var (
		r     io.Reader
		lom   = goi.lom
		sgl   *memsys.SGL
		cksum = lom.Checksum()
		size  int64
	)
	ckconf := lom.CksumConf()
	cksumRange := ckconf.Type != cos.ChecksumNone && ckconf.EnableReadRange
	size = hrng.Length
	r = io.NewSectionReader(lmfh, hrng.Start, hrng.Length)
	if cksumRange {
		sgl = goi.t.gmm.NewSGL(size)
		_, cksumH, err := cos.CopyAndChecksum(sgl /*as ReaderFrom*/, r, nil, ckconf.Type)
		if err != nil {
			sgl.Free()
			return err
		}
		r = sgl
		if cksumH != nil {
			cksum = &cksumH.Cksum
		}
	}

	// set response header
	whdr.Set(cos.HdrContentType, cos.ContentBinary)
	cmn.ToHeader(lom.ObjAttrs(), whdr, size, cksum)

	buf, slab := goi.t.gmm.AllocSize(min(size, memsys.DefaultBuf2Size))
	err = goi.transmit(r, buf, fqn)
	slab.Free(buf)
	if sgl != nil {
		sgl.Free()
	}
	return err
}

// in particular, setup reader and writer and set headers
func (goi *getOI) _txreg(fqn string, lmfh *os.File, whdr http.Header) (err error) {
	var (
		dpq   = goi.dpq
		lom   = goi.lom
		cksum = lom.Checksum()
		size  = lom.SizeBytes()
	)
	// set response header
	whdr.Set(cos.HdrContentType, cos.ContentBinary)
	cmn.ToHeader(lom.ObjAttrs(), whdr, size, cksum)
	if dpq.isS3 {
		// (expecting user to set bucket checksum = md5)
		s3.SetEtag(whdr, lom)
	}

	buf, slab := goi.t.gmm.AllocSize(min(size, memsys.DefaultBuf2Size))
	err = goi.transmit(lmfh, buf, fqn)
	slab.Free(buf)
	return err
}

// TODO: checksum
func (goi *getOI) _txarch(fqn string, lmfh *os.File, whdr http.Header) error {
	var (
		ar  archive.Reader
		dpq = goi.dpq
		lom = goi.lom
	)
	mime, err := archive.MimeFile(lmfh, goi.t.smm, dpq.arch.mime, lom.ObjName)
	if err != nil {
		return err
	}
	ar, err = archive.NewReader(mime, lmfh, lom.SizeBytes())
	if err != nil {
		return fmt.Errorf("failed to open %s: %w", lom.Cname(), err)
	}

	// single
	if dpq.arch.path != "" {
		debug.Assert(dpq.arch.mmode == "", dpq.arch.mmode)
		var csl cos.ReadCloseSizer
		csl, err = ar.ReadOne(dpq.arch.path)
		if err != nil {
			return cmn.NewErrFailedTo(goi.t, "extract "+dpq._archstr()+" from", lom.Cname(), err)
		}
		if csl == nil {
			return cos.NewErrNotFound(goi.t, dpq._archstr()+" in "+lom.Cname())
		}
		// found
		whdr.Set(cos.HdrContentType, cos.ContentBinary)
		buf, slab := goi.t.gmm.AllocSize(min(csl.Size(), memsys.DefaultBuf2Size))
		err = goi.transmit(csl, buf, fqn)
		slab.Free(buf)
		csl.Close()
		return err
	}

	// multi match; writing & streaming tar =>(directly)=> response writer
	debug.Assert(dpq.arch.mmode != "")
	rcb := _newRcb(goi.w)
	whdr.Set(cos.HdrContentType, cos.ContentTar)
	err = ar.ReadUntil(rcb, dpq.arch.regx, dpq.arch.mmode)
	if err != nil {
		err = cmn.NewErrFailedTo(goi.t, "extract files that match "+dpq._archstr()+" from", lom.Cname(), err)
	}
	if err == nil && rcb.num == 0 {
		// none found
		return cos.NewErrNotFound(goi.t, dpq._archstr()+" in "+lom.Cname())
	}
	rcb.fini()
	return err
}

func (goi *getOI) transmit(r io.Reader, buf []byte, fqn string) error {
	written, err := cos.CopyBuffer(goi.w, r, buf)
	if err != nil {
		if !cos.IsRetriableConnErr(err) {
			goi.t.fsErr(err, fqn)
		}
		nlog.Errorln(cmn.NewErrFailedTo(goi.t, "GET", fqn, err))
		// at this point, error is already written into the response -
		// return special code to indicate just that
		return errSendingResp
	}
	// Update objects sent during GFN. Thanks to this we will not
	// have to resend them in rebalance. In case of a race between rebalance
	// and GFN the former wins, resulting in duplicated transmission.
	if goi.dpq.isGFN {
		goi.t.reb.FilterAdd(cos.UnsafeB(goi.lom.Uname()))
	} else if !goi.cold { // GFN & cold-GET: must be already loaded w/ atime set
		if err := goi.lom.Load(false /*cache it*/, true /*locked*/); err != nil {
			nlog.Errorf("%s: GET post-transmission failure: %v", goi.t, err)
			return errSendingResp
		}
		goi.lom.SetAtimeUnix(goi.atime)
		goi.lom.Recache()
	}
	//
	// stats
	//
	goi.stats(written)
	return nil
}

func (goi *getOI) stats(written int64) {
	goi.t.statsT.AddMany(
		cos.NamedVal64{Name: stats.GetCount, Value: 1},
		cos.NamedVal64{Name: stats.GetSize, Value: written},
		cos.NamedVal64{Name: stats.GetThroughput, Value: written}, // vis-à-vis user (as written m.b. range)
		cos.NamedVal64{Name: stats.GetLatency, Value: mono.SinceNano(goi.ltime)}, // see also: stats.GetColdRwLatency
	)
	if goi.verchanged {
		goi.t.statsT.AddMany(
			cos.NamedVal64{Name: stats.VerChangeCount, Value: 1},
			cos.NamedVal64{Name: stats.VerChangeSize, Value: goi.lom.SizeBytes()},
		)
	}
}

// - parse and validate user specified read range (goi.ranges)
// - set response header accordingly
func (goi *getOI) rngToHeader(resphdr http.Header, size int64) (hrng *htrange, ecode int, err error) {
	var ranges []htrange
	ranges, err = parseMultiRange(goi.ranges.Range, size)
	if err != nil {
		if cmn.IsErrRangeNotSatisfiable(err) {
			// https://datatracker.ietf.org/doc/html/rfc7233#section-4.2
			resphdr.Set(cos.HdrContentRange, fmt.Sprintf("%s*/%d", cos.HdrContentRangeValPrefix, size))
		}
		ecode = http.StatusRequestedRangeNotSatisfiable
		return
	}
	if len(ranges) == 0 {
		return
	}
	if len(ranges) > 1 {
		err = cmn.NewErrUnsupp("multi-range read", goi.lom.Cname())
		ecode = http.StatusRequestedRangeNotSatisfiable
		return
	}
	if goi.dpq.arch.path != "" {
		err = cmn.NewErrUnsupp("range-read archived file", goi.dpq.arch.path)
		ecode = http.StatusRequestedRangeNotSatisfiable
		return
	}

	// set response header
	hrng = &ranges[0]
	resphdr.Set(cos.HdrAcceptRanges, "bytes")
	resphdr.Set(cos.HdrContentRange, hrng.contentRange(size))
	return
}

//
// APPEND a file or multiple files:
// - as a new object, if doesn't exist
// - to an existing object, if exists
//

func (a *apndOI) do(r *http.Request) (packedHdl string, ecode int, err error) {
	var (
		cksumValue    = r.Header.Get(apc.HdrObjCksumVal)
		cksumType     = r.Header.Get(apc.HdrObjCksumType)
		contentLength = r.Header.Get(cos.HdrContentLength)
	)
	if contentLength != "" {
		if size, ers := strconv.ParseInt(contentLength, 10, 64); ers == nil {
			a.size = size
		}
	}
	if cksumValue != "" {
		a.cksum = cos.NewCksum(cksumType, cksumValue)
	}

	switch a.op {
	case apc.AppendOp:
		buf, slab := a.t.gmm.Alloc()
		packedHdl, ecode, err = a.apnd(buf)
		slab.Free(buf)
	case apc.FlushOp:
		ecode, err = a.flush()
	default:
		err = fmt.Errorf("invalid operation %q (expecting either %q or %q) - check %q query",
			a.op, apc.AppendOp, apc.FlushOp, apc.QparamAppendType)
	}

	return packedHdl, ecode, err
}
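
// The packed handle returned by apnd() and consumed by parse() (below) has the form:
// nodeID + separator + workFQN + separator + checksum type + separator + base64(partial checksum state)
// - see pack() for the authoritative encoding.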

func (a *apndOI) apnd(buf []byte) (packedHdl string, ecode int, err error) {
	var (
		fh      *os.File
		workFQN = a.hdl.workFQN
	)
	if workFQN == "" {
		workFQN = fs.CSM.Gen(a.lom, fs.WorkfileType, fs.WorkfileAppend)
		a.lom.Lock(false)
		if a.lom.Load(false /*cache it*/, false /*locked*/) == nil {
			_, a.hdl.partialCksum, err = cos.CopyFile(a.lom.FQN, workFQN, buf, a.lom.CksumType())
			a.lom.Unlock(false)
			if err != nil {
				ecode = http.StatusInternalServerError
				return
			}
			fh, err = os.OpenFile(workFQN, os.O_APPEND|os.O_WRONLY, cos.PermRWR)
		} else {
			a.lom.Unlock(false)
			a.hdl.partialCksum = cos.NewCksumHash(a.lom.CksumType())
			fh, err = a.lom.CreateFile(workFQN)
		}
	} else {
		fh, err = os.OpenFile(workFQN, os.O_APPEND|os.O_WRONLY, cos.PermRWR)
		debug.Assert(a.hdl.partialCksum != nil)
	}
	if err != nil { // failed to open or create
		ecode = http.StatusInternalServerError
		return
	}

	w := cos.NewWriterMulti(fh, a.hdl.partialCksum.H)
	_, err = cos.CopyBuffer(w, a.r, buf)
	cos.Close(fh)
	if err != nil {
		ecode = http.StatusInternalServerError
		return
	}

	packedHdl = a.pack(workFQN)

	// stats (TODO: add `stats.FlushCount` for symmetry)
	lat := time.Now().UnixNano() - a.started
	a.t.statsT.AddMany(
		cos.NamedVal64{Name: stats.AppendCount, Value: 1},
		cos.NamedVal64{Name: stats.AppendLatency, Value: lat},
	)
	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		nlog.Infof("APPEND %s: %s", a.lom, time.Duration(lat))
	}
	return
}

func (a *apndOI) flush() (int, error) {
	if a.hdl.workFQN == "" {
		return 0, fmt.Errorf("failed to finalize append-file operation: empty source in the %+v handle", a.hdl)
	}

	// finalize checksum
	debug.Assert(a.hdl.partialCksum != nil)
	a.hdl.partialCksum.Finalize()
	partialCksum := a.hdl.partialCksum.Clone()
	if !a.cksum.IsEmpty() && !partialCksum.Equal(a.cksum) {
		return http.StatusInternalServerError, cos.NewErrDataCksum(partialCksum, a.cksum)
	}

	params := core.PromoteParams{
		Bck:    a.lom.Bck(),
		Cksum:  partialCksum,
		Config: a.config,
		PromoteArgs: apc.PromoteArgs{
			SrcFQN:       a.hdl.workFQN,
			ObjName:      a.lom.ObjName,
			OverwriteDst: true,
			DeleteSrc:    true, // NOTE: always overwrite and remove
		},
	}
	return a.t.Promote(&params)
}

func (a *apndOI) parse(packedHdl string) error {
	if packedHdl == "" {
		return nil
	}
	items, err := preParse(packedHdl)
	if err != nil {
		return err
	}
	a.hdl.partialCksum = cos.NewCksumHash(items[2])
	buf, err := base64.StdEncoding.DecodeString(items[3])
	if err != nil {
		return err
	}
	if err := a.hdl.partialCksum.H.(encoding.BinaryUnmarshaler).UnmarshalBinary(buf); err != nil {
		return err
	}

	a.hdl.nodeID = items[0]
	a.hdl.workFQN = items[1]
	return nil
}

func (a *apndOI) pack(workFQN string) string {
	buf, err := a.hdl.partialCksum.H.(encoding.BinaryMarshaler).MarshalBinary()
	debug.AssertNoErr(err)
	cksumTy := a.hdl.partialCksum.Type()
	cksumBinary := base64.StdEncoding.EncodeToString(buf)
	return a.t.SID() + appendHandleSepa + workFQN + appendHandleSepa + cksumTy + appendHandleSepa + cksumBinary
}

//
// COPY (object | reader)
//

// main method
func (coi *copyOI) do(t *target, dm *bundle.DataMover, lom *core.LOM) (size int64, err error) {
	if coi.DryRun {
		return coi._dryRun(lom, coi.ObjnameTo)
	}

	// DP == nil: use default (no-op transform) if source bucket is remote
	if coi.DP == nil && lom.Bck().IsRemote() {
		coi.DP = &core.LDP{}
	}

	// 1: dst location
	smap := t.owner.smap.Get()
	tsi, errN := smap.HrwName2T(coi.BckTo.MakeUname(coi.ObjnameTo))
	if errN != nil {
		return 0, errN
	}
	if tsi.ID() != t.SID() {
		return coi.send(t, dm, lom, coi.ObjnameTo, tsi)
	}

	// dst is this target
	// 2, 3: with transformation and without
	dst := core.AllocLOM(coi.ObjnameTo)
	if err := dst.InitBck(coi.BckTo.Bucket()); err != nil {
		core.FreeLOM(dst)
		return 0, err
	}
	if coi.DP != nil {
		var ecode int
		size, ecode, err = coi._reader(t, dm, lom, dst)
		debug.Assert(ecode != http.StatusNotFound || cos.IsNotExist(err, 0), err, ecode)
	} else {
		size, err = coi._regular(t, lom, dst)
	}
	core.FreeLOM(dst)

	return size, err
}

func (coi *copyOI) _dryRun(lom *core.LOM, objnameTo string) (size int64, err error) {
	if coi.DP == nil {
		if lom.Uname() != coi.BckTo.MakeUname(objnameTo) {
			size = lom.SizeBytes()
		}
		return size, nil
	}

	// discard the reader and be done
	var reader io.ReadCloser
	if reader, _, err = coi.DP.Reader(lom, false, false); err != nil {
		return 0, err
	}
	size, err = io.Copy(io.Discard, reader)
	reader.Close()
	return size, err
}

// PUT DP(lom) => dst
// The DP reader is responsible for any read-locking of the source lom.
//
// NOTE: no assumptions are being made on whether the source lom is present in cluster.
// (can be a "pure" metadata of a (non-existing) Cloud object; accordingly, DP's reader must
// be able to handle cold get, warm get, etc.)
//
// If destination bucket is remote:
// - create a local replica of the object on one of the targets, and
// - PUT to the relevant backend
// An option for _not_ storing the object _in_ the cluster would be a _feature_ that can be
// further debated.
func (coi *copyOI) _reader(t *target, dm *bundle.DataMover, lom, dst *core.LOM) (size int64, _ int, _ error) {
	reader, oah, errN := coi.DP.Reader(lom, coi.LatestVer, coi.Sync)
	if errN != nil {
		return 0, 0, errN
	}
	if lom.Bck().Equal(coi.BckTo, true, true) {
		dst.SetVersion(oah.Version())
	}

	poi := allocPOI()
	{
		poi.t = t
		poi.lom = dst
		poi.config = coi.Config
		poi.r = reader
		poi.owt = coi.OWT
		poi.xctn = coi.Xact // on behalf of
		poi.workFQN = fs.CSM.Gen(dst, fs.WorkfileType, "copy-dp")
		poi.atime = oah.AtimeUnix()
		poi.cksumToUse = oah.Checksum()
	}
	if dm != nil {
		poi.owt = dm.OWT() // (compare with _send)
	}
	ecode, err := poi.putObject()
	freePOI(poi)
	if err == nil {
		// xaction stats: inc locally processed (and see data mover for in and out objs)
		size = oah.SizeBytes()
	}
	return size, ecode, err
}

func (coi *copyOI) _regular(t *target, lom, dst *core.LOM) (size int64, _ error) {
	if lom.FQN == dst.FQN { // resilvering with a single mountpath?
		return
	}
	lcopy := lom.Uname() == dst.Uname() // n-way copy
	lom.Lock(lcopy)
	defer lom.Unlock(lcopy)

	if err := lom.Load(false /*cache it*/, true /*locked*/); err != nil {
		if !cos.IsNotExist(err, 0) {
			err = cmn.NewErrFailedTo(t, "coi-load", lom.Cname(), err)
		}
		return 0, err
	}

	// w-lock the destination unless already locked (above)
	if !lcopy {
		dst.Lock(true)
		defer dst.Unlock(true)
		if err := dst.Load(false /*cache it*/, true /*locked*/); err == nil {
			if lom.EqCksum(dst.Checksum()) {
				return 0, nil
			}
		} else if cmn.IsErrBucketNought(err) {
			return 0, err
		}
	}
	dst2, err := lom.Copy2FQN(dst.FQN, coi.Buf)
	if err == nil {
		size = lom.SizeBytes()
		if coi.Finalize {
			t.putMirror(dst2)
		}
	}
	if dst2 != nil {
		core.FreeLOM(dst2)
	}
	return size, err
}

// send object => designated target
// * source is a LOM or a reader (that may be reading from remote)
// * one of the two equivalent transmission mechanisms: PUT or transport Send
func (coi *copyOI) send(t *target, dm *bundle.DataMover, lom *core.LOM, objNameTo string, tsi *meta.Snode) (size int64, err error) {
	debug.Assert(coi.OWT > 0)
	sargs := allocSnda()
	{
		sargs.objNameTo = objNameTo
		sargs.tsi = tsi
		sargs.dm = dm
		sargs.owt = coi.OWT
	}
	if dm != nil {
		sargs.owt = dm.OWT() // takes precedence
	}
	size, err = coi._send(t, lom, sargs)
	freeSnda(sargs)
	return
}

func (coi *copyOI) _send(t *target, lom *core.LOM, sargs *sendArgs) (size int64, _ error) {
	debug.Assert(!coi.DryRun)
	if sargs.dm != nil {
		// clone the `lom` to use it in the async operation (free it via `_sendObjDM` callback)
		lom = lom.CloneMD(lom.FQN)
	}

	switch {
	case coi.OWT == cmn.OwtPromote:
		// 1. promote
		debug.Assert(coi.DP == nil)
		debug.Assert(sargs.owt == cmn.OwtPromote)

		fh, err := cos.NewFileHandle(lom.FQN)
		if err != nil {
			if os.IsNotExist(err) {
				return 0, nil
			}
			return 0, cmn.NewErrFailedTo(t, "open", lom.Cname(), err)
		}
		fi, err := fh.Stat()
		if err != nil {
			fh.Close()
			return 0, cmn.NewErrFailedTo(t, "fstat", lom.Cname(), err)
		}
		size = fi.Size()
		sargs.reader, sargs.objAttrs = fh, lom
	case coi.DP == nil:
		// 2. migrate/replicate lom

		lom.Lock(false)
		if err := lom.Load(false /*cache it*/, true /*locked*/); err != nil {
			lom.Unlock(false)
			return 0, nil
		}
		reader, err := lom.NewDeferROC()
		if err != nil {
			return 0, err
		}
		size = lom.SizeBytes()
		sargs.reader, sargs.objAttrs = reader, lom
	default:
		// 3. DP transform (possibly, no-op)
		// If the object is not present call t.Backend.GetObjReader
		reader, oah, err := coi.DP.Reader(lom, coi.LatestVer, coi.Sync)
		if err != nil {
			return 0, err
		}
		// returns cos.ContentLengthUnknown (-1) if post-transform size is unknown
		size = oah.SizeBytes()
		sargs.reader, sargs.objAttrs = reader, oah
	}

	// do
	var err error
	sargs.bckTo = coi.BckTo
	if sargs.dm != nil {
		err = coi._dm(lom /*for attrs*/, sargs)
	} else {
		err = coi.put(t, sargs)
	}
	return size, err
}

// use data mover to transmit objects to other targets
// (compare with coi.put())
func (coi *copyOI) _dm(lom *core.LOM, sargs *sendArgs) error {
	debug.Assert(sargs.dm.OWT() == sargs.owt)
	debug.Assert(sargs.dm.GetXact() == coi.Xact || sargs.dm.GetXact().ID() == coi.Xact.ID())
	o := transport.AllocSend()
	hdr, oa := &o.Hdr, sargs.objAttrs
	{
		hdr.Bck.Copy(sargs.bckTo.Bucket())
		hdr.ObjName = sargs.objNameTo
		hdr.ObjAttrs.CopyFrom(oa, false /*skip cksum*/)
	}
	o.Callback = func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, _ error) {
		core.FreeLOM(lom)
	}
	return sargs.dm.Send(o, sargs.reader, sargs.tsi)
}

// PUT(lom) => destination target (compare with coi.dm())
// always closes params.Reader, either explicitly or via Do()
func (coi *copyOI) put(t *target, sargs *sendArgs) error {
	var (
		hdr   = make(http.Header, 8)
		query = sargs.bckTo.NewQuery()
	)
	cmn.ToHeader(sargs.objAttrs, hdr, sargs.objAttrs.SizeBytes(true))
	hdr.Set(apc.HdrT2TPutterID, t.SID())
	query.Set(apc.QparamOWT, sargs.owt.ToS())
	if coi.Xact != nil {
		query.Set(apc.QparamUUID, coi.Xact.ID())
	}
	reqArgs := cmn.HreqArgs{
		Method: http.MethodPut,
		Base:   sargs.tsi.URL(cmn.NetIntraData),
		Path:   apc.URLPathObjects.Join(sargs.bckTo.Name, sargs.objNameTo),
		Query:  query,
		Header: hdr,
		BodyR:  sargs.reader,
	}
	req, _, cancel, err := reqArgs.ReqWithTimeout(coi.Config.Timeout.SendFile.D())
	if err != nil {
		cos.Close(sargs.reader)
		return fmt.Errorf("unexpected failure to create request, err: %w", err)
	}
	defer cancel()
	resp, err := g.client.data.Do(req)
	if err != nil {
		return cmn.NewErrFailedTo(t, "coi.put "+sargs.bckTo.Name+"/"+sargs.objNameTo, sargs.tsi, err)
	}
	cos.DrainReader(resp.Body)
	resp.Body.Close()
	return nil
}

func (coi *copyOI) stats(size int64, err error) {
	if err == nil && coi.Xact != nil {
		coi.Xact.ObjsAdd(1, size)
	}
}

//
// PUT a new shard _or_ APPEND to an existing one (w/ read/write/list via cmn/archive)
//
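
// Appending to an existing TAR takes a fast path: the shard is renamed to a workfile,
// reopened at its end (archive.OpenTarSeekEnd), and the new record is written in place
// without copying the rest of the archive. Other formats (and PUT of a new shard) go
// through the copy-and-append path (`cpap`) below.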

func (a *putA2I) do() (int, error) {
	if a.filename == "" {
		return 0, errors.New("archive path is not defined")
	}
	// standard library does not support appending to tgz, zip, and such;
	// for TAR there is an optimizing workaround not requiring a full copy
	if a.mime == archive.ExtTar && !a.put {
		var (
			err       error
			fh        *os.File
			size      int64
			tarFormat tar.Format
			workFQN   = fs.CSM.Gen(a.lom, fs.WorkfileType, fs.WorkfileAppendToArch)
		)
		if err = os.Rename(a.lom.FQN, workFQN); err != nil {
			return http.StatusInternalServerError, err
		}
		fh, tarFormat, err = archive.OpenTarSeekEnd(a.lom.Cname(), workFQN)
		if err != nil {
			if errV := a.lom.RenameFrom(workFQN); errV != nil {
				return http.StatusInternalServerError, errV
			}
			if err == archive.ErrTarIsEmpty {
				a.put = true
				goto cpap
			}
			return http.StatusInternalServerError, err
		}
		// do - fast
		if size, err = a.fast(fh, tarFormat); err == nil {
			// NOTE: checksum traded off
			if err = a.finalize(size, cos.NoneCksum, workFQN); err == nil {
				return 0, nil // ok
			}
		}
		if errV := a.lom.RenameFrom(workFQN); errV != nil {
			nlog.Errorf(fmtNested, a.t, err, "append and rename back", workFQN, errV)
		}
		return http.StatusInternalServerError, err
	}

cpap: // copy + append
	var (
		err       error
		lmfh, wfh *os.File
		workFQN   string
		cksum     cos.CksumHashSize
		aw        archive.Writer
	)
	workFQN = fs.CSM.Gen(a.lom, fs.WorkfileType, fs.WorkfileAppendToArch)
	wfh, err = os.OpenFile(workFQN, os.O_CREATE|os.O_WRONLY, cos.PermRWR)
	if err != nil {
		return http.StatusInternalServerError, err
	}
	// currently, arch writers only use size and time but it may change
	oah := cos.SimpleOAH{Size: a.size, Atime: a.started}
	if a.put {
		// when append becomes PUT (TODO: checksum type)
		cksum.Init(cos.ChecksumXXHash)
		aw = archive.NewWriter(a.mime, wfh, &cksum, nil /*opts*/)
		err = aw.Write(a.filename, oah, a.r)
		aw.Fini()
	} else {
		// copy + append
		lmfh, err = a.lom.OpenFile()
		if err != nil {
			cos.Close(wfh)
			return http.StatusNotFound, err
		}
		cksum.Init(a.lom.CksumType())
		aw = archive.NewWriter(a.mime, wfh, &cksum, nil)
		err = aw.Copy(lmfh, a.lom.SizeBytes())
		if err == nil {
			err = aw.Write(a.filename, oah, a.r)
		}
		aw.Fini() // in that order
		cos.Close(lmfh)
	}

	// finalize
	cos.Close(wfh)
	if err == nil {
		cksum.Finalize()
		err = a.finalize(cksum.Size, cksum.Clone(), workFQN)
	} else {
		cos.RemoveFile(workFQN)
	}
	return a.reterr(err)
}

// TAR only - fast & direct
func (a *putA2I) fast(rwfh *os.File, tarFormat tar.Format) (size int64, err error) {
	var (
		buf, slab = a.t.gmm.AllocSize(a.size)
		tw        = tar.NewWriter(rwfh)
		hdr       = tar.Header{
			Typeflag: tar.TypeReg,
			Name:     a.filename,
			Size:     a.size,
			ModTime:  time.Unix(0, a.started),
			Mode:     int64(cos.PermRWRR),
			Format:   tarFormat,
		}
	)
	tw.WriteHeader(&hdr)
	_, err = io.CopyBuffer(tw, a.r, buf) // append
	cos.Close(tw)
	if err == nil {
		size, err = rwfh.Seek(0, io.SeekCurrent)
	}
	slab.Free(buf)
	cos.Close(rwfh)
	return
}

func (*putA2I) reterr(err error) (int, error) {
	ecode := http.StatusInternalServerError
	if cmn.IsErrCapExceeded(err) {
		ecode = http.StatusInsufficientStorage
	}
	return ecode, err
}

func (a *putA2I) finalize(size int64, cksum *cos.Cksum, fqn string) error {
	debug.Func(func() {
		finfo, err := os.Stat(fqn)
		debug.AssertNoErr(err)
		debug.Assertf(finfo.Size() == size, "%d != %d", finfo.Size(), size)
	})
	// done
	if err := a.lom.RenameFrom(fqn); err != nil {
		return err
	}
	a.lom.SetSize(size)
	a.lom.SetCksum(cksum)
	a.lom.SetAtimeUnix(a.started)
	if err := a.lom.Persist(); err != nil {
		return err
	}
	if a.lom.ECEnabled() {
		if err := ec.ECM.EncodeObject(a.lom, nil); err != nil && err != ec.ErrorECDisabled {
			return err
		}
	}
	a.t.putMirror(a.lom)
	return nil
}

//
// put mirror (main)
//

func (t *target) putMirror(lom *core.LOM) {
	mconfig := lom.MirrorConf()
	if !mconfig.Enabled {
		return
	}
	if mpathCnt := fs.NumAvail(); mpathCnt < int(mconfig.Copies) {
		t.statsT.IncErr(stats.ErrPutMirrorCount)
		nanotim := mono.NanoTime()
		if nanotim&0x7 == 7 {
			if mpathCnt == 0 {
				nlog.Errorf("%s: %v", t, cmn.ErrNoMountpaths)
			} else {
				nlog.Errorf(fmtErrInsuffMpaths2, t, mpathCnt, lom, mconfig.Copies)
			}
		}
		return
	}
	rns := xreg.RenewPutMirror(lom)
	if rns.Err != nil {
		nlog.Errorf("%s: %s %v", t, lom, rns.Err)
		debug.AssertNoErr(rns.Err)
		return
	}
	xctn := rns.Entry.Get()
	xputlrep := xctn.(*mirror.XactPut)
	xputlrep.Repl(lom)
}

// TODO:
// - CopyBuffer
// - currently, only tar - add message pack (what else?)
// - Call(..., *tar.Header) to avoid typecast

type rcbCtx struct {
	w   io.Writer
	tw  *tar.Writer
	num int
}

var _ archive.ArchRCB = (*rcbCtx)(nil)

func _newRcb(w io.Writer) (c *rcbCtx) {
	c = &rcbCtx{w: w}
	return c
}

func (c *rcbCtx) Call(_ string, reader cos.ReadCloseSizer, hdr any) (_ bool /*stop*/, err error) {
	if c.tw == nil {
		debug.Assert(c.num == 0)
		c.tw = tar.NewWriter(c.w)
	}
	c.num++
	tarHdr, ok := hdr.(*tar.Header)
	debug.Assert(ok)
	if err = c.tw.WriteHeader(tarHdr); err == nil {
		_, err = io.Copy(c.tw, reader)
	}
	return false, err
}

func (c *rcbCtx) fini() {
	if c.tw != nil {
		debug.Assert(c.num > 0)
		c.tw.Close()
	}
}

//
// mem pools
//

var (
	goiPool, poiPool, sndPool sync.Pool

	goi0 getOI
	poi0 putOI
	snd0 sendArgs
)

func allocGOI() (a *getOI) {
	if v := goiPool.Get(); v != nil {
		a = v.(*getOI)
		return
	}
	return &getOI{}
}

func freeGOI(a *getOI) {
	*a = goi0
	goiPool.Put(a)
}

func allocPOI() (a *putOI) {
	if v := poiPool.Get(); v != nil {
		a = v.(*putOI)
		return
	}
	return &putOI{}
}

func freePOI(a *putOI) {
	*a = poi0
	poiPool.Put(a)
}

func allocSnda() (a *sendArgs) {
	if v := sndPool.Get(); v != nil {
		a = v.(*sendArgs)
		return
	}
	return &sendArgs{}
}

func freeSnda(a *sendArgs) {
	*a = snd0
	sndPool.Put(a)
}