github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/core/lom.go (about)

     1  // Package core provides core metadata and in-cluster API
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package core
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"runtime"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/NVIDIA/aistore/api/apc"
    18  	"github.com/NVIDIA/aistore/cmn"
    19  	"github.com/NVIDIA/aistore/cmn/atomic"
    20  	"github.com/NVIDIA/aistore/cmn/cos"
    21  	"github.com/NVIDIA/aistore/cmn/debug"
    22  	"github.com/NVIDIA/aistore/cmn/feat"
    23  	"github.com/NVIDIA/aistore/core/meta"
    24  	"github.com/NVIDIA/aistore/fs"
    25  	"github.com/NVIDIA/aistore/ios"
    26  	"github.com/NVIDIA/aistore/memsys"
    27  	"github.com/NVIDIA/aistore/transport"
    28  )
    29  
    30  // Local Object Metadata (LOM) is a locally stored object metadata comprising, in part:
    31  // - name, version, atime, checksum, size, etc. object attributes and flags
    32  // - runtime context including properties and configuration of the bucket
    33  //   that contains this LOM
    34  
const (
	// lomInitialVersion is the version string that IncVersion assigns to an
	// object whose version is still empty.
	lomInitialVersion = "1"
)
    38  
// core stats: counter names (of these, only LcacheCollisionCount is used in the
// code visible here, where it is passed to g.tstats.Inc)
const (
	RemoteDeletedDelCount = "remote.deleted.del.n"

	// lcache stats

	// incremented when the entry cached under a LOM's digest carries a different uname
	LcacheCollisionCount = "lcache.collision.n"
	LcacheEvictedCount   = "lcache.evicted.n"
	LcacheFlushColdCount = "lcache.flush.cold.n"
)
    48  
type (
	// lmeta is the metadata portion of a LOM; pointers to lmeta copies are
	// the values stored in the LOM caches (see Load, Recache).
	lmeta struct {
		copies fs.MPI
		// uname is used as the key for name locking and is compared against
		// cached entries to detect digest collisions
		uname string
		cmn.ObjAttrs
		atimefs uint64 // NOTE: high bit is reserved for `dirty`
		// bckID is the ID of the containing bucket; zero means "not loaded" (see loaded())
		bckID uint64
	}
	// LOM is the Local Object Metadata: object name and location plus
	// the metadata (lmeta) and the containing bucket.
	LOM struct {
		mi      *fs.Mountpath
		bck     meta.Bck
		ObjName string
		FQN     string
		HrwFQN  string // (=> main replica)
		md      lmeta  // on-disk metadata
		digest  uint64 // uname digest
	}
)
    67  
type (
	// global groups the package-level, target-only state that Tinit initializes
	// (all fields except lchk, which Tinit does not touch).
	global struct {
		tstats   cos.StatsUpdater // (stats.Trunner)
		pmm, smm *memsys.MMSA     // set from Target.PageMM() and Target.ByteMM(), respectively
		maxLmeta atomic.Int64     // initialized to xattrMaxSize
		locker   nameLocker       // object-name locks, indexed by LOM.CacheIdx()
		lchk     lchk             // evicted in its entirety by Term()
	}
)
    77  
// bckLocker is initialized by both Pinit and Tinit.
var bckLocker nameLocker // common

// target only
var (
	T Target // set by Tinit
	g global // populated by Tinit
)

// interface guard: compile-time checks that *LOM implements these interfaces
var (
	_ cos.OAH     = (*LOM)(nil)
	_ fs.PartsFQN = (*LOM)(nil)
	_ lifUnlocker = (*LOM)(nil)
)
    92  
    93  func Pinit() { bckLocker = newNameLocker() }
    94  
    95  func Tinit(t Target, tstats cos.StatsUpdater, runHK bool) {
    96  	bckLocker = newNameLocker()
    97  	T = t
    98  	{
    99  		g.maxLmeta.Store(xattrMaxSize)
   100  		g.locker = newNameLocker()
   101  		g.tstats = tstats
   102  		g.pmm = t.PageMM()
   103  		g.smm = t.ByteMM()
   104  	}
   105  	if runHK {
   106  		regLomCacheWithHK()
   107  	}
   108  }
   109  
   110  func Term() {
   111  	const sleep = time.Second >> 2 // total <= 2s
   112  	for i := 0; i < 8 && !g.lchk.running.CAS(false, true); i++ {
   113  		time.Sleep(sleep)
   114  	}
   115  	g.lchk.evictAll(termDuration)
   116  }
   117  
   118  /////////
   119  // LOM //
   120  /////////
   121  
   122  func (lom *LOM) ObjAttrs() *cmn.ObjAttrs { return &lom.md.ObjAttrs }
   123  
   124  // LOM == remote-object equality check
   125  func (lom *LOM) Equal(rem cos.OAH) bool { return lom.ObjAttrs().Equal(rem) }
   126  
   127  func (lom *LOM) CopyAttrs(oah cos.OAH, skipCksum bool) {
   128  	lom.md.ObjAttrs.CopyFrom(oah, skipCksum)
   129  }
   130  
   131  // special a) when a new version is being created b) for usage in unit tests
   132  func (lom *LOM) SizeBytes(special ...bool) int64 {
   133  	debug.Assert(len(special) > 0 || lom.loaded(), lom.String())
   134  	return lom.md.Size
   135  }
   136  
   137  func (lom *LOM) Version(special ...bool) string {
   138  	debug.Assert(len(special) > 0 || lom.loaded())
   139  	return lom.md.Ver
   140  }
   141  
   142  func (lom *LOM) Uname() string  { return lom.md.uname }
   143  func (lom *LOM) Digest() uint64 { return lom.digest }
   144  
   145  func (lom *LOM) SetSize(size int64)    { lom.md.Size = size }
   146  func (lom *LOM) SetVersion(ver string) { lom.md.Ver = ver }
   147  
   148  func (lom *LOM) Checksum() *cos.Cksum          { return lom.md.Cksum }
   149  func (lom *LOM) SetCksum(cksum *cos.Cksum)     { lom.md.Cksum = cksum }
   150  func (lom *LOM) EqCksum(cksum *cos.Cksum) bool { return lom.md.Cksum.Equal(cksum) }
   151  
   152  func (lom *LOM) Atime() time.Time      { return time.Unix(0, lom.md.Atime) }
   153  func (lom *LOM) AtimeUnix() int64      { return lom.md.Atime }
   154  func (lom *LOM) SetAtimeUnix(tu int64) { lom.md.Atime = tu }
   155  
   156  // custom metadata
   157  func (lom *LOM) GetCustomMD() cos.StrKVs   { return lom.md.GetCustomMD() }
   158  func (lom *LOM) SetCustomMD(md cos.StrKVs) { lom.md.SetCustomMD(md) }
   159  
   160  func (lom *LOM) GetCustomKey(key string) (string, bool) { return lom.md.GetCustomKey(key) }
   161  func (lom *LOM) SetCustomKey(key, value string)         { lom.md.SetCustomKey(key, value) }
   162  
   163  // lom <= transport.ObjHdr (NOTE: caller must call freeLOM)
   164  func AllocLomFromHdr(hdr *transport.ObjHdr) (lom *LOM, err error) {
   165  	lom = AllocLOM(hdr.ObjName)
   166  	if err = lom.InitBck(&hdr.Bck); err != nil {
   167  		return
   168  	}
   169  	lom.CopyAttrs(&hdr.ObjAttrs, false /*skip checksum*/)
   170  	return
   171  }
   172  
   173  func (lom *LOM) IsHRW() bool { return lom.HrwFQN == lom.FQN } // subj to resilvering
   174  
   175  func (lom *LOM) Bprops() *cmn.Bprops { return lom.bck.Props }
   176  
   177  // bprops accessors for convenience
   178  func (lom *LOM) ECEnabled() bool                { return lom.Bprops().EC.Enabled }
   179  func (lom *LOM) IsFeatureSet(f feat.Flags) bool { return lom.Bprops().Features.IsSet(f) }
   180  func (lom *LOM) MirrorConf() *cmn.MirrorConf    { return &lom.Bprops().Mirror }
   181  func (lom *LOM) CksumConf() *cmn.CksumConf      { return lom.bck.CksumConf() }
   182  func (lom *LOM) CksumType() string              { return lom.bck.CksumConf().Type }
   183  func (lom *LOM) VersionConf() cmn.VersionConf   { return lom.bck.VersionConf() }
   184  
   185  // as fs.PartsFQN
   186  func (lom *LOM) ObjectName() string       { return lom.ObjName }
   187  func (lom *LOM) Bck() *meta.Bck           { return &lom.bck }
   188  func (lom *LOM) Bucket() *cmn.Bck         { return (*cmn.Bck)(&lom.bck) }
   189  func (lom *LOM) Mountpath() *fs.Mountpath { return lom.mi }
   190  func (lom *LOM) Location() string         { return T.String() + apc.LocationPropSepa + lom.mi.String() }
   191  
   192  func ParseObjLoc(loc string) (tname, mpname string) {
   193  	i := strings.IndexByte(loc, apc.LocationPropSepa[0])
   194  	tname, mpname = loc[:i], loc[i+1:]
   195  	return
   196  }
   197  
   198  // see also: transport.ObjHdr.Cname()
   199  func (lom *LOM) Cname() string { return lom.bck.Cname(lom.ObjName) }
   200  
   201  func (lom *LOM) WritePolicy() (p apc.WritePolicy) {
   202  	if bprops := lom.Bprops(); bprops == nil {
   203  		p = apc.WriteImmediate
   204  	} else {
   205  		p = bprops.WritePolicy.MD
   206  	}
   207  	return
   208  }
   209  
   210  func (lom *LOM) loaded() bool { return lom.md.bckID != 0 }
   211  
   212  func (lom *LOM) HrwTarget(smap *meta.Smap) (tsi *meta.Snode, local bool, err error) {
   213  	tsi, err = smap.HrwHash2T(lom.digest)
   214  	if err != nil {
   215  		return
   216  	}
   217  	local = tsi.ID() == T.SID()
   218  	return
   219  }
   220  
   221  func (lom *LOM) IncVersion() error {
   222  	debug.Assert(lom.Bck().IsAIS())
   223  	if lom.md.Ver == "" {
   224  		lom.SetVersion(lomInitialVersion)
   225  		return nil
   226  	}
   227  	ver, err := strconv.Atoi(lom.md.Ver)
   228  	if err != nil {
   229  		return fmt.Errorf("%s: %v", lom, err)
   230  	}
   231  	lom.SetVersion(strconv.Itoa(ver + 1))
   232  	return nil
   233  }
   234  
   235  // Returns stored checksum (if present) and computed checksum (if requested)
   236  // MAY compute and store a missing (xxhash) checksum.
// If xattr checksum is different than lom's metadata checksum, returns an error
// and does not recompute the checksum even if recompute is set to true.
   239  //
   240  // * objects are stored in the cluster with their content checksums and in accordance
   241  //   with their bucket configurations.
   242  // * xxhash is the system-default checksum.
   243  // * user can override the system default on a bucket level, by setting checksum=none.
   244  // * bucket (re)configuration can be done at any time.
   245  // * an object with a bad checksum cannot be retrieved (via GET) and cannot be replicated
   246  //   or migrated.
   247  // * GET and PUT operations support an option to validate checksums.
   248  // * validation is done against a checksum stored with an object (GET), or a checksum
   249  //   provided by a user (PUT).
   250  // * replications and migrations are always protected by checksums.
   251  // * when two objects in the cluster have identical (bucket, object) names and checksums,
   252  //   they are considered to be full replicas of each other.
   253  // ==============================================================================
   254  
   255  // ValidateMetaChecksum validates whether checksum stored in lom's in-memory metadata
   256  // matches checksum stored on disk.
   257  // Use lom.ValidateContentChecksum() to recompute and check object's content checksum.
   258  func (lom *LOM) ValidateMetaChecksum() error {
   259  	var (
   260  		md  *lmeta
   261  		err error
   262  	)
   263  	if lom.CksumType() == cos.ChecksumNone {
   264  		return nil
   265  	}
   266  	wmd := lom.WritePolicy()
   267  	if wmd == apc.WriteNever || (wmd == apc.WriteDelayed && lom.md.isDirty()) {
   268  		// cannot validate meta checksum
   269  		return nil
   270  	}
   271  	md, err = lom.lmfsReload(false)
   272  	if err != nil {
   273  		return err
   274  	}
   275  	if md == nil {
   276  		return fmt.Errorf("%s: no meta", lom)
   277  	}
   278  	if lom.md.Cksum == nil {
   279  		lom.SetCksum(md.Cksum)
   280  		return nil
   281  	}
   282  	// different versions may have different checksums
   283  	if md.Ver == lom.md.Ver && !lom.EqCksum(md.Cksum) {
   284  		err = cos.NewErrDataCksum(lom.md.Cksum, md.Cksum, lom.String())
   285  		lom.Uncache()
   286  	}
   287  	return err
   288  }
   289  
// ValidateContentChecksum validates if checksum stored in lom's in-memory metadata
// matches object's content checksum.
// Use lom.ValidateMetaChecksum() to check lom's checksum vs on-disk metadata.
//
// Flow:
//   - nothing to do when the effective checksum type is none;
//   - the content checksum is computed using the type of lom's own checksum if
//     the latter is not empty, and the bucket-configured type otherwise;
//   - when lom has no checksum, the computed one is stored and persisted
//     (and rolled back in memory if lom.Persist fails);
//   - on mismatch, metadata is reloaded once (lmfsReload) and the check is
//     repeated - with recomputation if the checksum type has changed;
//   - on the final mismatch, returns a data-checksum error and uncaches lom.
func (lom *LOM) ValidateContentChecksum() (err error) {
	var (
		cksumType = lom.CksumType()
		cksums    = struct {
			stor *cos.Cksum     // stored with LOM
			comp *cos.CksumHash // computed
		}{stor: lom.md.Cksum}
		reloaded bool // true after the one and only reload attempt
	)
recomp:
	if cksumType == cos.ChecksumNone { // as far as do-no-checksum-checking bucket rules
		return
	}
	if !lom.md.Cksum.IsEmpty() {
		cksumType = lom.md.Cksum.Ty() // takes precedence on the other hand
	}
	if cksums.comp, err = lom.ComputeCksum(cksumType); err != nil {
		return
	}
	if lom.md.Cksum.IsEmpty() { // store computed
		lom.md.Cksum = cksums.comp.Clone()
		if !lom.loaded() {
			lom.SetAtimeUnix(time.Now().UnixNano())
		}
		if err = lom.Persist(); err != nil {
			// failed to persist: restore the original in-memory value
			lom.md.Cksum = cksums.stor
		}
		return
	}
	if cksums.comp.Equal(lom.md.Cksum) {
		return
	}
	if reloaded {
		// already retried once
		goto ex
	}
	// retry: load from disk and check again
	reloaded = true
	if _, err = lom.lmfsReload(true); err == nil && lom.md.Cksum != nil {
		// type changed - recompute
		if cksumType != lom.md.Cksum.Ty() {
			cksums.stor = lom.md.Cksum
			cksumType = lom.CksumType()
			goto recomp
		}
		// otherwise, check
		if cksums.comp.Equal(lom.md.Cksum) {
			return
		}
	}
ex:
	// mismatch: computed vs. stored
	err = cos.NewErrDataCksum(&cksums.comp.Cksum, cksums.stor, lom.String())
	lom.Uncache()
	return
}
   347  
   348  func (lom *LOM) ComputeSetCksum() (*cos.Cksum, error) {
   349  	var (
   350  		cksum          *cos.Cksum
   351  		cksumHash, err = lom.ComputeCksum(lom.CksumType())
   352  	)
   353  	if err != nil {
   354  		return nil, err
   355  	}
   356  	if cksumHash != nil {
   357  		cksum = cksumHash.Clone()
   358  	}
   359  	lom.SetCksum(cksum)
   360  	return cksum, nil
   361  }
   362  
   363  func (lom *LOM) ComputeCksum(cksumType string) (cksum *cos.CksumHash, err error) {
   364  	var file *os.File
   365  	if cksumType == cos.ChecksumNone {
   366  		return
   367  	}
   368  	if file, err = lom.OpenFile(); err != nil {
   369  		return
   370  	}
   371  	// No need to allocate `buf` as `io.Discard` has efficient `io.ReaderFrom` implementation.
   372  	_, cksum, err = cos.CopyAndChecksum(io.Discard, file, nil, cksumType)
   373  	cos.Close(file)
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  	return
   378  }
   379  
   380  // no lock is taken when locked by an immediate caller, or otherwise is known to be locked
   381  // otherwise, try Rlock temporarily _if and only when_ reading from fs
   382  //
   383  // (compare w/ LoadUnsafe() below)
   384  func (lom *LOM) Load(cacheit, locked bool) error {
   385  	var (
   386  		lcache, lmd = lom.fromCache()
   387  		bmd         = T.Bowner().Get()
   388  	)
   389  	// fast path
   390  	if lmd != nil {
   391  		lom.md = *lmd
   392  		return lom._checkBucket(bmd)
   393  	}
   394  
   395  	// slow path
   396  	if !locked && lom.TryLock(false) {
   397  		defer lom.Unlock(false)
   398  	}
   399  	if err := lom.FromFS(); err != nil {
   400  		return err
   401  	}
   402  	bid := lom.Bprops().BID
   403  	debug.Assert(bid != 0, lom.Cname())
   404  	if bid == 0 {
   405  		return nil
   406  	}
   407  	lom.md.bckID = bid
   408  	if err := lom._checkBucket(bmd); err != nil {
   409  		return err
   410  	}
   411  	if cacheit && lcache != nil {
   412  		md := lom.md
   413  		lcache.Store(lom.digest, &md)
   414  	}
   415  	return nil
   416  }
   417  
   418  func (lom *LOM) _checkBucket(bmd *meta.BMD) (err error) {
   419  	bck, bckID := &lom.bck, lom.md.bckID
   420  	debug.Assert(bckID != 0)
   421  	bprops, present := bmd.Get(bck)
   422  	if !present {
   423  		if bck.IsRemote() {
   424  			return cmn.NewErrRemoteBckNotFound(bck.Bucket())
   425  		}
   426  		return cmn.NewErrBckNotFound(bck.Bucket())
   427  	}
   428  	if bckID == bprops.BID {
   429  		return nil // ok
   430  	}
   431  	err = cmn.NewErrObjDefunct(lom.String(), lom.md.bckID, lom.bck.Props.BID)
   432  	return
   433  }
   434  
   435  // usage: fast (and unsafe) loading object metadata except atime - no locks
   436  // compare with conventional Load() above
   437  func (lom *LOM) LoadUnsafe() (err error) {
   438  	var (
   439  		_, lmd = lom.fromCache()
   440  		bmd    = T.Bowner().Get()
   441  	)
   442  	// fast path
   443  	if lmd != nil {
   444  		lom.md = *lmd
   445  		err = lom._checkBucket(bmd)
   446  		return
   447  	}
   448  	// read and decode xattr; NOTE: fs.GetXattr* vs fs.SetXattr race possible and must be
   449  	// either a) handled or b) benign from the caller's perspective
   450  	if _, err = lom.lmfs(true); err != nil {
   451  		return
   452  	}
   453  	// check bucket
   454  	bid := lom.Bprops().BID
   455  	debug.Assert(bid != 0, lom.Cname())
   456  	if bid == 0 {
   457  		return
   458  	}
   459  	lom.md.bckID = bid
   460  	return lom._checkBucket(bmd)
   461  }
   462  
   463  //
   464  // lom cache -------------------------------------------------------------
   465  //
   466  
   467  // store new or refresh existing
   468  func (lom *LOM) Recache() {
   469  	debug.Assert(!lom.IsCopy())
   470  	md := lom.md
   471  	bid := lom.Bprops().BID
   472  	debug.Assert(bid != 0)
   473  	md.bckID, lom.md.bckID = bid, bid
   474  
   475  	lcache := lom.lcache()
   476  	val, ok := lcache.Swap(lom.digest, &md)
   477  	if !ok {
   478  		return
   479  	}
   480  	lmd := val.(*lmeta)
   481  	if lmd.uname != lom.md.uname {
   482  		g.tstats.Inc(LcacheCollisionCount) // target stats
   483  	} else {
   484  		// updating the value that's already in the map (race extremely unlikely, benign anyway)
   485  		md.cpAtime(lmd)
   486  	}
   487  }
   488  
   489  func (lom *LOM) Uncache() {
   490  	lcache := lom.lcache()
   491  	md, ok := lcache.LoadAndDelete(lom.digest)
   492  	if !ok {
   493  		return
   494  	}
   495  	lmd := md.(*lmeta)
   496  	if lmd.uname != lom.md.uname {
   497  		g.tstats.Inc(LcacheCollisionCount) // target stats
   498  	} else {
   499  		lom.md.cpAtime(lmd)
   500  	}
   501  }
   502  
   503  // remove from cache unless dirty
   504  func (lom *LOM) UncacheUnless() {
   505  	lcache, lmd := lom.fromCache()
   506  	if lmd == nil {
   507  		return
   508  	}
   509  	if !lmd.isDirty() {
   510  		lom.md.cpAtime(lmd)
   511  		lcache.Delete(lom.md.uname)
   512  	}
   513  }
   514  
   515  func (lom *LOM) CacheIdx() int     { return fs.LcacheIdx(lom.digest) } // (lif.CacheIdx())
   516  func (lom *LOM) lcache() *sync.Map { return lom.mi.LomCache(lom.CacheIdx()) }
   517  
   518  func (lom *LOM) fromCache() (lcache *sync.Map, lmd *lmeta) {
   519  	lcache = lom.lcache()
   520  	if md, ok := lcache.Load(lom.digest); ok {
   521  		lmd = md.(*lmeta)
   522  		if lmd.uname != lom.md.uname {
   523  			g.tstats.Inc(LcacheCollisionCount) // target stats
   524  		}
   525  	}
   526  	return
   527  }
   528  
   529  func (lom *LOM) FromFS() error {
   530  	finfo, atimefs, err := ios.FinfoAtime(lom.FQN)
   531  	if err != nil {
   532  		if !os.IsNotExist(err) {
   533  			err = os.NewSyscallError("stat", err)
   534  			T.FSHC(err, lom.FQN)
   535  		}
   536  		return err
   537  	}
   538  	if _, err = lom.lmfs(true); err != nil {
   539  		// retry once
   540  		if cmn.IsErrLmetaNotFound(err) {
   541  			runtime.Gosched()
   542  			_, err = lom.lmfs(true)
   543  		}
   544  	}
   545  	if err != nil {
   546  		if !cmn.IsErrLmetaNotFound(err) {
   547  			T.FSHC(err, lom.FQN)
   548  		}
   549  		return err
   550  	}
   551  	// fstat & atime
   552  	if lom.md.Size != finfo.Size() { // corruption or tampering
   553  		return cmn.NewErrLmetaCorrupted(lom.whingeSize(finfo.Size()))
   554  	}
   555  	lom.md.Atime = atimefs
   556  	lom.md.atimefs = uint64(atimefs)
   557  	return nil
   558  }
   559  
   560  func (lom *LOM) whingeSize(size int64) error {
   561  	return fmt.Errorf("errsize (%d != %d)", lom.md.Size, size)
   562  }
   563  
   564  func lomCaches() []*sync.Map {
   565  	var (
   566  		i              int
   567  		availablePaths = fs.GetAvail()
   568  		cachesCnt      = len(availablePaths) * cos.MultiSyncMapCount
   569  		caches         = make([]*sync.Map, cachesCnt)
   570  	)
   571  	for _, mi := range availablePaths {
   572  		for idx := range cos.MultiSyncMapCount {
   573  			caches[i] = mi.LomCache(idx)
   574  			i++
   575  		}
   576  	}
   577  	return caches
   578  }
   579  
   580  //
   581  // lock/unlock ------------------------------------------
   582  //
   583  
   584  func (lom *LOM) getLocker() *nlc { return &g.locker[lom.CacheIdx()] } // (lif.getLocker())
   585  
   586  func (lom *LOM) IsLocked() (int /*rc*/, bool /*exclusive*/) {
   587  	nlc := lom.getLocker()
   588  	return nlc.IsLocked(lom.Uname())
   589  }
   590  
   591  func (lom *LOM) TryLock(exclusive bool) bool {
   592  	nlc := lom.getLocker()
   593  	return nlc.TryLock(lom.Uname(), exclusive)
   594  }
   595  
   596  func (lom *LOM) Lock(exclusive bool) {
   597  	nlc := lom.getLocker()
   598  	nlc.Lock(lom.Uname(), exclusive)
   599  }
   600  
   601  func (lom *LOM) UpgradeLock() (finished bool) {
   602  	nlc := lom.getLocker()
   603  	return nlc.UpgradeLock(lom.Uname())
   604  }
   605  
   606  func (lom *LOM) DowngradeLock() {
   607  	nlc := lom.getLocker()
   608  	nlc.DowngradeLock(lom.Uname())
   609  }
   610  
   611  func (lom *LOM) Unlock(exclusive bool) {
   612  	nlc := lom.getLocker()
   613  	nlc.Unlock(lom.Uname(), exclusive)
   614  }