github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/core/lom.go (about) 1 // Package core provides core metadata and in-cluster API 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package core 6 7 import ( 8 "fmt" 9 "io" 10 "os" 11 "runtime" 12 "strconv" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/NVIDIA/aistore/api/apc" 18 "github.com/NVIDIA/aistore/cmn" 19 "github.com/NVIDIA/aistore/cmn/atomic" 20 "github.com/NVIDIA/aistore/cmn/cos" 21 "github.com/NVIDIA/aistore/cmn/debug" 22 "github.com/NVIDIA/aistore/cmn/feat" 23 "github.com/NVIDIA/aistore/core/meta" 24 "github.com/NVIDIA/aistore/fs" 25 "github.com/NVIDIA/aistore/ios" 26 "github.com/NVIDIA/aistore/memsys" 27 "github.com/NVIDIA/aistore/transport" 28 ) 29 30 // Local Object Metadata (LOM) is a locally stored object metadata comprising, in part: 31 // - name, version, atime, checksum, size, etc. object attributes and flags 32 // - runtime context including properties and configuration of the bucket 33 // that contains this LOM 34 35 const ( 36 lomInitialVersion = "1" 37 ) 38 39 // core stats 40 const ( 41 RemoteDeletedDelCount = "remote.deleted.del.n" 42 43 // lcache stats 44 LcacheCollisionCount = "lcache.collision.n" 45 LcacheEvictedCount = "lcache.evicted.n" 46 LcacheFlushColdCount = "lcache.flush.cold.n" 47 ) 48 49 type ( 50 lmeta struct { 51 copies fs.MPI 52 uname string 53 cmn.ObjAttrs 54 atimefs uint64 // NOTE: high bit is reserved for `dirty` 55 bckID uint64 56 } 57 LOM struct { 58 mi *fs.Mountpath 59 bck meta.Bck 60 ObjName string 61 FQN string 62 HrwFQN string // (=> main replica) 63 md lmeta // on-disk metadata 64 digest uint64 // uname digest 65 } 66 ) 67 68 type ( 69 global struct { 70 tstats cos.StatsUpdater // (stats.Trunner) 71 pmm, smm *memsys.MMSA 72 maxLmeta atomic.Int64 73 locker nameLocker 74 lchk lchk 75 } 76 ) 77 78 var bckLocker nameLocker // common 79 80 // target only 81 var ( 82 T Target 83 g global 84 ) 85 86 // interface guard 87 var ( 88 _ cos.OAH = (*LOM)(nil) 89 _ fs.PartsFQN = (*LOM)(nil) 90 _ lifUnlocker = (*LOM)(nil) 91 ) 92 93 func Pinit() { bckLocker = newNameLocker() } 94 95 func Tinit(t Target, tstats cos.StatsUpdater, runHK bool) { 96 bckLocker = newNameLocker() 97 T = t 98 { 99 g.maxLmeta.Store(xattrMaxSize) 100 g.locker = newNameLocker() 101 g.tstats = tstats 102 g.pmm = t.PageMM() 103 g.smm = t.ByteMM() 104 } 105 if runHK { 106 regLomCacheWithHK() 107 } 108 } 109 110 func Term() { 111 const sleep = time.Second >> 2 // total <= 2s 112 for i := 0; i < 8 && !g.lchk.running.CAS(false, true); i++ { 113 time.Sleep(sleep) 114 } 115 g.lchk.evictAll(termDuration) 116 } 117 118 ///////// 119 // LOM // 120 ///////// 121 122 func (lom *LOM) ObjAttrs() *cmn.ObjAttrs { return &lom.md.ObjAttrs } 123 124 // LOM == remote-object equality check 125 func (lom *LOM) Equal(rem cos.OAH) bool { return lom.ObjAttrs().Equal(rem) } 126 127 func (lom *LOM) CopyAttrs(oah cos.OAH, skipCksum bool) { 128 lom.md.ObjAttrs.CopyFrom(oah, skipCksum) 129 } 130 131 // special a) when a new version is being created b) for usage in unit tests 132 func (lom *LOM) SizeBytes(special ...bool) int64 { 133 debug.Assert(len(special) > 0 || lom.loaded(), lom.String()) 134 return lom.md.Size 135 } 136 137 func (lom *LOM) Version(special ...bool) string { 138 debug.Assert(len(special) > 0 || lom.loaded()) 139 return lom.md.Ver 140 } 141 142 func (lom *LOM) Uname() string { return lom.md.uname } 143 func (lom *LOM) Digest() uint64 { return lom.digest } 144 145 func (lom *LOM) SetSize(size int64) { lom.md.Size = size } 146 func (lom *LOM) SetVersion(ver string) { lom.md.Ver = ver } 147 148 func (lom *LOM) Checksum() *cos.Cksum { return lom.md.Cksum } 149 func (lom *LOM) SetCksum(cksum *cos.Cksum) { lom.md.Cksum = cksum } 150 func (lom *LOM) EqCksum(cksum *cos.Cksum) bool { return lom.md.Cksum.Equal(cksum) } 151 152 func (lom *LOM) Atime() time.Time { return time.Unix(0, lom.md.Atime) } 153 func (lom *LOM) AtimeUnix() int64 { return lom.md.Atime } 154 func (lom *LOM) SetAtimeUnix(tu int64) { lom.md.Atime = tu } 155 156 // custom metadata 157 func (lom *LOM) GetCustomMD() cos.StrKVs { return lom.md.GetCustomMD() } 158 func (lom *LOM) SetCustomMD(md cos.StrKVs) { lom.md.SetCustomMD(md) } 159 160 func (lom *LOM) GetCustomKey(key string) (string, bool) { return lom.md.GetCustomKey(key) } 161 func (lom *LOM) SetCustomKey(key, value string) { lom.md.SetCustomKey(key, value) } 162 163 // lom <= transport.ObjHdr (NOTE: caller must call freeLOM) 164 func AllocLomFromHdr(hdr *transport.ObjHdr) (lom *LOM, err error) { 165 lom = AllocLOM(hdr.ObjName) 166 if err = lom.InitBck(&hdr.Bck); err != nil { 167 return 168 } 169 lom.CopyAttrs(&hdr.ObjAttrs, false /*skip checksum*/) 170 return 171 } 172 173 func (lom *LOM) IsHRW() bool { return lom.HrwFQN == lom.FQN } // subj to resilvering 174 175 func (lom *LOM) Bprops() *cmn.Bprops { return lom.bck.Props } 176 177 // bprops accessors for convenience 178 func (lom *LOM) ECEnabled() bool { return lom.Bprops().EC.Enabled } 179 func (lom *LOM) IsFeatureSet(f feat.Flags) bool { return lom.Bprops().Features.IsSet(f) } 180 func (lom *LOM) MirrorConf() *cmn.MirrorConf { return &lom.Bprops().Mirror } 181 func (lom *LOM) CksumConf() *cmn.CksumConf { return lom.bck.CksumConf() } 182 func (lom *LOM) CksumType() string { return lom.bck.CksumConf().Type } 183 func (lom *LOM) VersionConf() cmn.VersionConf { return lom.bck.VersionConf() } 184 185 // as fs.PartsFQN 186 func (lom *LOM) ObjectName() string { return lom.ObjName } 187 func (lom *LOM) Bck() *meta.Bck { return &lom.bck } 188 func (lom *LOM) Bucket() *cmn.Bck { return (*cmn.Bck)(&lom.bck) } 189 func (lom *LOM) Mountpath() *fs.Mountpath { return lom.mi } 190 func (lom *LOM) Location() string { return T.String() + apc.LocationPropSepa + lom.mi.String() } 191 192 func ParseObjLoc(loc string) (tname, mpname string) { 193 i := strings.IndexByte(loc, apc.LocationPropSepa[0]) 194 tname, mpname = loc[:i], loc[i+1:] 195 return 196 } 197 198 // see also: transport.ObjHdr.Cname() 199 func (lom *LOM) Cname() string { return lom.bck.Cname(lom.ObjName) } 200 201 func (lom *LOM) WritePolicy() (p apc.WritePolicy) { 202 if bprops := lom.Bprops(); bprops == nil { 203 p = apc.WriteImmediate 204 } else { 205 p = bprops.WritePolicy.MD 206 } 207 return 208 } 209 210 func (lom *LOM) loaded() bool { return lom.md.bckID != 0 } 211 212 func (lom *LOM) HrwTarget(smap *meta.Smap) (tsi *meta.Snode, local bool, err error) { 213 tsi, err = smap.HrwHash2T(lom.digest) 214 if err != nil { 215 return 216 } 217 local = tsi.ID() == T.SID() 218 return 219 } 220 221 func (lom *LOM) IncVersion() error { 222 debug.Assert(lom.Bck().IsAIS()) 223 if lom.md.Ver == "" { 224 lom.SetVersion(lomInitialVersion) 225 return nil 226 } 227 ver, err := strconv.Atoi(lom.md.Ver) 228 if err != nil { 229 return fmt.Errorf("%s: %v", lom, err) 230 } 231 lom.SetVersion(strconv.Itoa(ver + 1)) 232 return nil 233 } 234 235 // Returns stored checksum (if present) and computed checksum (if requested) 236 // MAY compute and store a missing (xxhash) checksum. 237 // If xattr checksum is different than lom's metadata checksum, returns error 238 // and do not recompute checksum even if recompute set to true. 239 // 240 // * objects are stored in the cluster with their content checksums and in accordance 241 // with their bucket configurations. 242 // * xxhash is the system-default checksum. 243 // * user can override the system default on a bucket level, by setting checksum=none. 244 // * bucket (re)configuration can be done at any time. 245 // * an object with a bad checksum cannot be retrieved (via GET) and cannot be replicated 246 // or migrated. 247 // * GET and PUT operations support an option to validate checksums. 248 // * validation is done against a checksum stored with an object (GET), or a checksum 249 // provided by a user (PUT). 250 // * replications and migrations are always protected by checksums. 251 // * when two objects in the cluster have identical (bucket, object) names and checksums, 252 // they are considered to be full replicas of each other. 253 // ============================================================================== 254 255 // ValidateMetaChecksum validates whether checksum stored in lom's in-memory metadata 256 // matches checksum stored on disk. 257 // Use lom.ValidateContentChecksum() to recompute and check object's content checksum. 258 func (lom *LOM) ValidateMetaChecksum() error { 259 var ( 260 md *lmeta 261 err error 262 ) 263 if lom.CksumType() == cos.ChecksumNone { 264 return nil 265 } 266 wmd := lom.WritePolicy() 267 if wmd == apc.WriteNever || (wmd == apc.WriteDelayed && lom.md.isDirty()) { 268 // cannot validate meta checksum 269 return nil 270 } 271 md, err = lom.lmfsReload(false) 272 if err != nil { 273 return err 274 } 275 if md == nil { 276 return fmt.Errorf("%s: no meta", lom) 277 } 278 if lom.md.Cksum == nil { 279 lom.SetCksum(md.Cksum) 280 return nil 281 } 282 // different versions may have different checksums 283 if md.Ver == lom.md.Ver && !lom.EqCksum(md.Cksum) { 284 err = cos.NewErrDataCksum(lom.md.Cksum, md.Cksum, lom.String()) 285 lom.Uncache() 286 } 287 return err 288 } 289 290 // ValidateDiskChecksum validates if checksum stored in lom's in-memory metadata 291 // matches object's content checksum. 292 // Use lom.ValidateMetaChecksum() to check lom's checksum vs on-disk metadata. 293 func (lom *LOM) ValidateContentChecksum() (err error) { 294 var ( 295 cksumType = lom.CksumType() 296 cksums = struct { 297 stor *cos.Cksum // stored with LOM 298 comp *cos.CksumHash // computed 299 }{stor: lom.md.Cksum} 300 reloaded bool 301 ) 302 recomp: 303 if cksumType == cos.ChecksumNone { // as far as do-no-checksum-checking bucket rules 304 return 305 } 306 if !lom.md.Cksum.IsEmpty() { 307 cksumType = lom.md.Cksum.Ty() // takes precedence on the other hand 308 } 309 if cksums.comp, err = lom.ComputeCksum(cksumType); err != nil { 310 return 311 } 312 if lom.md.Cksum.IsEmpty() { // store computed 313 lom.md.Cksum = cksums.comp.Clone() 314 if !lom.loaded() { 315 lom.SetAtimeUnix(time.Now().UnixNano()) 316 } 317 if err = lom.Persist(); err != nil { 318 lom.md.Cksum = cksums.stor 319 } 320 return 321 } 322 if cksums.comp.Equal(lom.md.Cksum) { 323 return 324 } 325 if reloaded { 326 goto ex 327 } 328 // retry: load from disk and check again 329 reloaded = true 330 if _, err = lom.lmfsReload(true); err == nil && lom.md.Cksum != nil { 331 // type changed - recompute 332 if cksumType != lom.md.Cksum.Ty() { 333 cksums.stor = lom.md.Cksum 334 cksumType = lom.CksumType() 335 goto recomp 336 } 337 // otherwise, check 338 if cksums.comp.Equal(lom.md.Cksum) { 339 return 340 } 341 } 342 ex: 343 err = cos.NewErrDataCksum(&cksums.comp.Cksum, cksums.stor, lom.String()) 344 lom.Uncache() 345 return 346 } 347 348 func (lom *LOM) ComputeSetCksum() (*cos.Cksum, error) { 349 var ( 350 cksum *cos.Cksum 351 cksumHash, err = lom.ComputeCksum(lom.CksumType()) 352 ) 353 if err != nil { 354 return nil, err 355 } 356 if cksumHash != nil { 357 cksum = cksumHash.Clone() 358 } 359 lom.SetCksum(cksum) 360 return cksum, nil 361 } 362 363 func (lom *LOM) ComputeCksum(cksumType string) (cksum *cos.CksumHash, err error) { 364 var file *os.File 365 if cksumType == cos.ChecksumNone { 366 return 367 } 368 if file, err = lom.OpenFile(); err != nil { 369 return 370 } 371 // No need to allocate `buf` as `io.Discard` has efficient `io.ReaderFrom` implementation. 372 _, cksum, err = cos.CopyAndChecksum(io.Discard, file, nil, cksumType) 373 cos.Close(file) 374 if err != nil { 375 return nil, err 376 } 377 return 378 } 379 380 // no lock is taken when locked by an immediate caller, or otherwise is known to be locked 381 // otherwise, try Rlock temporarily _if and only when_ reading from fs 382 // 383 // (compare w/ LoadUnsafe() below) 384 func (lom *LOM) Load(cacheit, locked bool) error { 385 var ( 386 lcache, lmd = lom.fromCache() 387 bmd = T.Bowner().Get() 388 ) 389 // fast path 390 if lmd != nil { 391 lom.md = *lmd 392 return lom._checkBucket(bmd) 393 } 394 395 // slow path 396 if !locked && lom.TryLock(false) { 397 defer lom.Unlock(false) 398 } 399 if err := lom.FromFS(); err != nil { 400 return err 401 } 402 bid := lom.Bprops().BID 403 debug.Assert(bid != 0, lom.Cname()) 404 if bid == 0 { 405 return nil 406 } 407 lom.md.bckID = bid 408 if err := lom._checkBucket(bmd); err != nil { 409 return err 410 } 411 if cacheit && lcache != nil { 412 md := lom.md 413 lcache.Store(lom.digest, &md) 414 } 415 return nil 416 } 417 418 func (lom *LOM) _checkBucket(bmd *meta.BMD) (err error) { 419 bck, bckID := &lom.bck, lom.md.bckID 420 debug.Assert(bckID != 0) 421 bprops, present := bmd.Get(bck) 422 if !present { 423 if bck.IsRemote() { 424 return cmn.NewErrRemoteBckNotFound(bck.Bucket()) 425 } 426 return cmn.NewErrBckNotFound(bck.Bucket()) 427 } 428 if bckID == bprops.BID { 429 return nil // ok 430 } 431 err = cmn.NewErrObjDefunct(lom.String(), lom.md.bckID, lom.bck.Props.BID) 432 return 433 } 434 435 // usage: fast (and unsafe) loading object metadata except atime - no locks 436 // compare with conventional Load() above 437 func (lom *LOM) LoadUnsafe() (err error) { 438 var ( 439 _, lmd = lom.fromCache() 440 bmd = T.Bowner().Get() 441 ) 442 // fast path 443 if lmd != nil { 444 lom.md = *lmd 445 err = lom._checkBucket(bmd) 446 return 447 } 448 // read and decode xattr; NOTE: fs.GetXattr* vs fs.SetXattr race possible and must be 449 // either a) handled or b) benign from the caller's perspective 450 if _, err = lom.lmfs(true); err != nil { 451 return 452 } 453 // check bucket 454 bid := lom.Bprops().BID 455 debug.Assert(bid != 0, lom.Cname()) 456 if bid == 0 { 457 return 458 } 459 lom.md.bckID = bid 460 return lom._checkBucket(bmd) 461 } 462 463 // 464 // lom cache ------------------------------------------------------------- 465 // 466 467 // store new or refresh existing 468 func (lom *LOM) Recache() { 469 debug.Assert(!lom.IsCopy()) 470 md := lom.md 471 bid := lom.Bprops().BID 472 debug.Assert(bid != 0) 473 md.bckID, lom.md.bckID = bid, bid 474 475 lcache := lom.lcache() 476 val, ok := lcache.Swap(lom.digest, &md) 477 if !ok { 478 return 479 } 480 lmd := val.(*lmeta) 481 if lmd.uname != lom.md.uname { 482 g.tstats.Inc(LcacheCollisionCount) // target stats 483 } else { 484 // updating the value that's already in the map (race extremely unlikely, benign anyway) 485 md.cpAtime(lmd) 486 } 487 } 488 489 func (lom *LOM) Uncache() { 490 lcache := lom.lcache() 491 md, ok := lcache.LoadAndDelete(lom.digest) 492 if !ok { 493 return 494 } 495 lmd := md.(*lmeta) 496 if lmd.uname != lom.md.uname { 497 g.tstats.Inc(LcacheCollisionCount) // target stats 498 } else { 499 lom.md.cpAtime(lmd) 500 } 501 } 502 503 // remove from cache unless dirty 504 func (lom *LOM) UncacheUnless() { 505 lcache, lmd := lom.fromCache() 506 if lmd == nil { 507 return 508 } 509 if !lmd.isDirty() { 510 lom.md.cpAtime(lmd) 511 lcache.Delete(lom.md.uname) 512 } 513 } 514 515 func (lom *LOM) CacheIdx() int { return fs.LcacheIdx(lom.digest) } // (lif.CacheIdx()) 516 func (lom *LOM) lcache() *sync.Map { return lom.mi.LomCache(lom.CacheIdx()) } 517 518 func (lom *LOM) fromCache() (lcache *sync.Map, lmd *lmeta) { 519 lcache = lom.lcache() 520 if md, ok := lcache.Load(lom.digest); ok { 521 lmd = md.(*lmeta) 522 if lmd.uname != lom.md.uname { 523 g.tstats.Inc(LcacheCollisionCount) // target stats 524 } 525 } 526 return 527 } 528 529 func (lom *LOM) FromFS() error { 530 finfo, atimefs, err := ios.FinfoAtime(lom.FQN) 531 if err != nil { 532 if !os.IsNotExist(err) { 533 err = os.NewSyscallError("stat", err) 534 T.FSHC(err, lom.FQN) 535 } 536 return err 537 } 538 if _, err = lom.lmfs(true); err != nil { 539 // retry once 540 if cmn.IsErrLmetaNotFound(err) { 541 runtime.Gosched() 542 _, err = lom.lmfs(true) 543 } 544 } 545 if err != nil { 546 if !cmn.IsErrLmetaNotFound(err) { 547 T.FSHC(err, lom.FQN) 548 } 549 return err 550 } 551 // fstat & atime 552 if lom.md.Size != finfo.Size() { // corruption or tampering 553 return cmn.NewErrLmetaCorrupted(lom.whingeSize(finfo.Size())) 554 } 555 lom.md.Atime = atimefs 556 lom.md.atimefs = uint64(atimefs) 557 return nil 558 } 559 560 func (lom *LOM) whingeSize(size int64) error { 561 return fmt.Errorf("errsize (%d != %d)", lom.md.Size, size) 562 } 563 564 func lomCaches() []*sync.Map { 565 var ( 566 i int 567 availablePaths = fs.GetAvail() 568 cachesCnt = len(availablePaths) * cos.MultiSyncMapCount 569 caches = make([]*sync.Map, cachesCnt) 570 ) 571 for _, mi := range availablePaths { 572 for idx := range cos.MultiSyncMapCount { 573 caches[i] = mi.LomCache(idx) 574 i++ 575 } 576 } 577 return caches 578 } 579 580 // 581 // lock/unlock ------------------------------------------ 582 // 583 584 func (lom *LOM) getLocker() *nlc { return &g.locker[lom.CacheIdx()] } // (lif.getLocker()) 585 586 func (lom *LOM) IsLocked() (int /*rc*/, bool /*exclusive*/) { 587 nlc := lom.getLocker() 588 return nlc.IsLocked(lom.Uname()) 589 } 590 591 func (lom *LOM) TryLock(exclusive bool) bool { 592 nlc := lom.getLocker() 593 return nlc.TryLock(lom.Uname(), exclusive) 594 } 595 596 func (lom *LOM) Lock(exclusive bool) { 597 nlc := lom.getLocker() 598 nlc.Lock(lom.Uname(), exclusive) 599 } 600 601 func (lom *LOM) UpgradeLock() (finished bool) { 602 nlc := lom.getLocker() 603 return nlc.UpgradeLock(lom.Uname()) 604 } 605 606 func (lom *LOM) DowngradeLock() { 607 nlc := lom.getLocker() 608 nlc.DowngradeLock(lom.Uname()) 609 } 610 611 func (lom *LOM) Unlock(exclusive bool) { 612 nlc := lom.getLocker() 613 nlc.Unlock(lom.Uname(), exclusive) 614 }