github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/core/lom_xattr.go (about)

     1  // Package core provides core metadata and in-cluster API
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package core
     6  
     7  import (
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  	"os"
    12  	"strings"
    13  	"syscall"
    14  	"time"
    15  
    16  	"github.com/NVIDIA/aistore/api/apc"
    17  	"github.com/NVIDIA/aistore/cmn"
    18  	"github.com/NVIDIA/aistore/cmn/cos"
    19  	"github.com/NVIDIA/aistore/cmn/debug"
    20  	"github.com/NVIDIA/aistore/cmn/nlog"
    21  	"github.com/NVIDIA/aistore/fs"
    22  	"github.com/NVIDIA/aistore/ios"
    23  	"github.com/NVIDIA/aistore/memsys"
    24  	"github.com/OneOfOne/xxhash"
    25  )
    26  
    27  // On-disk metadata layout - changing any of this must be done with respect
    28  // to backward compatibility (and with caution).
    29  //
    30  // | ------------------ PREAMBLE ----------------- | --- MD VALUES ---- |
    31  // | --- 1 --- | ----- 1 ----- | -- [CKSUM LEN] -- | - [METADATA LEN] - |
    32  // |  version  | checksum-type |   checksum-value  | ---- metadata ---- |
    33  //
    34  // * version - determines the layout version. Thanks to this we can be backward
    35  //   compatible and deprecate old versions if needed.
    36  // * checksum-type - determines the checksum algorithm used to compute checksum
    37  //   of the metadata.
    38  // * checksum-value - computed checksum of the metadata. The length of the checksum
    39  //   can vary depending on the checksum algorithm.
    40  // * metadata - the rest of the layout. The content of the metadata can vary depending
    41  //   on the version of the layout.
    42  
    43  // the one and only currently supported checksum type == xxhash;
    44  // NOTE: adding more checksums will likely require a new cmn.MetaverLOM version
    45  const mdCksumTyXXHash = 1
    46  
    47  const (
    48  	XattrLOM      = "user.ais.lom" // on-disk xattr name
    49  	xattrMaxSize  = memsys.MaxSmallSlabSize
    50  	DumpLomEnvVar = "AIS_DUMP_LOM"
    51  )
    52  
    53  // packing format internal attrs
    54  const (
    55  	lomCksumType = iota
    56  	lomCksumValue
    57  	lomObjVersion
    58  	lomObjSize
    59  	lomObjCopies
    60  	lomCustomMD
    61  )
    62  
    63  // packing format separators
    64  const (
    65  	copyFQNSepa  = "\x00"
    66  	customMDSepa = "\x01"
    67  	recordSepa   = "\xe3/\xbd"
    68  	lenRecSepa   = len(recordSepa)
    69  )
    70  
    71  const prefLen = 10 // 10B prefix [ version = 1 | checksum-type | 64-bit xxhash ]
    72  
    73  const getxattr = "getxattr" // syscall
    74  
    75  // used in tests
    76  func (lom *LOM) AcquireAtimefs() error {
    77  	_, atime, err := ios.FinfoAtime(lom.FQN)
    78  	if err != nil {
    79  		return err
    80  	}
    81  	lom.md.Atime = atime
    82  	lom.md.atimefs = uint64(atime)
    83  	return nil
    84  }
    85  
    86  // NOTE: used in tests, ignores `dirty`
    87  func (lom *LOM) LoadMetaFromFS() error {
    88  	_, atime, err := ios.FinfoAtime(lom.FQN)
    89  	if err != nil {
    90  		return err
    91  	}
    92  	if _, err := lom.lmfs(true); err != nil {
    93  		return err
    94  	}
    95  	lom.md.Atime = atime
    96  	lom.md.atimefs = uint64(atime)
    97  	return nil
    98  }
    99  
   100  func whingeLmeta(err error) (*lmeta, error) {
   101  	if cos.IsErrXattrNotFound(err) {
   102  		return nil, cmn.NewErrLmetaNotFound(err)
   103  	}
   104  	return nil, os.NewSyscallError(getxattr, err)
   105  }
   106  
   107  func (lom *LOM) lmfsReload(populate bool) (md *lmeta, err error) {
   108  	saved := lom.md.pushrt()
   109  	md, err = lom.lmfs(populate)
   110  	if err == nil {
   111  		md.poprt(saved)
   112  	}
   113  	return
   114  }
   115  
   116  func (lom *LOM) lmfs(populate bool) (md *lmeta, err error) {
   117  	var (
   118  		size      int64
   119  		read      []byte
   120  		mdSize    = g.maxLmeta.Load()
   121  		buf, slab = g.smm.AllocSize(mdSize)
   122  	)
   123  	read, err = fs.GetXattrBuf(lom.FQN, XattrLOM, buf)
   124  	if err != nil {
   125  		slab.Free(buf)
   126  		if err != syscall.ERANGE {
   127  			return whingeLmeta(err)
   128  		}
   129  		debug.Assert(mdSize < xattrMaxSize)
   130  		// 2nd attempt: max-size
   131  		buf, slab = g.smm.AllocSize(xattrMaxSize)
   132  		read, err = fs.GetXattrBuf(lom.FQN, XattrLOM, buf)
   133  		if err != nil {
   134  			slab.Free(buf)
   135  			return whingeLmeta(err)
   136  		}
   137  	}
   138  	size = int64(len(read))
   139  	if size == 0 {
   140  		nlog.Errorf("%s[%s]: ENOENT", lom, lom.FQN)
   141  		err = os.NewSyscallError(getxattr, syscall.ENOENT)
   142  		slab.Free(buf)
   143  		return
   144  	}
   145  	md = &lom.md
   146  	if !populate {
   147  		md = &lmeta{}
   148  	}
   149  	err = md.unmarshal(read)
   150  	if err == nil {
   151  		_recomputeMdSize(size, mdSize)
   152  	} else {
   153  		err = cmn.NewErrLmetaCorrupted(err)
   154  	}
   155  	slab.Free(buf)
   156  	return
   157  }
   158  
   159  func (lom *LOM) PersistMain() (err error) {
   160  	atime := lom.AtimeUnix()
   161  	debug.Assert(cos.IsValidAtime(atime))
   162  	if atime < 0 /*prefetch*/ || !lom.WritePolicy().IsImmediate() /*write-never, write-delayed*/ {
   163  		lom.md.makeDirty()
   164  		lom.Recache()
   165  		return
   166  	}
   167  	// write-immediate (default)
   168  	buf := lom.marshal()
   169  	if err = fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil {
   170  		lom.Uncache()
   171  		T.FSHC(err, lom.FQN)
   172  	} else {
   173  		lom.md.clearDirty()
   174  		lom.Recache()
   175  	}
   176  	g.smm.Free(buf)
   177  	return
   178  }
   179  
   180  // (caller must set atime; compare with the above)
   181  func (lom *LOM) Persist() (err error) {
   182  	atime := lom.AtimeUnix()
   183  	debug.Assert(cos.IsValidAtime(atime), atime)
   184  
   185  	if atime < 0 || !lom.WritePolicy().IsImmediate() {
   186  		lom.md.makeDirty()
   187  		if lom.Bprops() != nil {
   188  			if !lom.IsCopy() {
   189  				lom.Recache()
   190  			}
   191  			lom.md.bckID = lom.Bprops().BID
   192  		}
   193  		return
   194  	}
   195  
   196  	buf := lom.marshal()
   197  	if err = fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil {
   198  		lom.Uncache()
   199  		T.FSHC(err, lom.FQN)
   200  	} else {
   201  		lom.md.clearDirty()
   202  		if lom.Bprops() != nil {
   203  			if !lom.IsCopy() {
   204  				lom.Recache()
   205  			}
   206  			lom.md.bckID = lom.Bprops().BID
   207  		}
   208  	}
   209  	g.smm.Free(buf)
   210  	return
   211  }
   212  
   213  func (lom *LOM) persistMdOnCopies() (copyFQN string, err error) {
   214  	buf := lom.marshal()
   215  	// replicate across copies
   216  	for copyFQN = range lom.md.copies {
   217  		if copyFQN == lom.FQN {
   218  			continue
   219  		}
   220  		if err = fs.SetXattr(copyFQN, XattrLOM, buf); err != nil {
   221  			break
   222  		}
   223  	}
   224  	g.smm.Free(buf)
   225  	return
   226  }
   227  
   228  // NOTE: not clearing dirty flag as the caller will uncache anyway
   229  func (lom *LOM) flushCold(md *lmeta, atime time.Time) {
   230  	if err := lom.flushAtime(atime); err != nil {
   231  		return
   232  	}
   233  	if !md.isDirty() || lom.WritePolicy() == apc.WriteNever {
   234  		return
   235  	}
   236  	lom.md = *md
   237  	if err := lom.syncMetaWithCopies(); err != nil {
   238  		return
   239  	}
   240  	buf := lom.marshal()
   241  	if err := fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil {
   242  		T.FSHC(err, lom.FQN)
   243  	}
   244  	g.smm.Free(buf)
   245  }
   246  
   247  func (lom *LOM) flushAtime(atime time.Time) error {
   248  	finfo, err := os.Stat(lom.FQN)
   249  	if err != nil {
   250  		return err
   251  	}
   252  	mtime := finfo.ModTime()
   253  	return os.Chtimes(lom.FQN, atime, mtime)
   254  }
   255  
   256  func (lom *LOM) marshal() (buf []byte) {
   257  	lmsize := g.maxLmeta.Load()
   258  	buf = lom.md.marshal(lmsize)
   259  	size := int64(len(buf))
   260  	debug.Assert(size <= xattrMaxSize)
   261  	_recomputeMdSize(size, lmsize)
   262  	return
   263  }
   264  
   265  func _recomputeMdSize(size, mdSize int64) {
   266  	const grow = memsys.SmallSlabIncStep
   267  	var nsize int64
   268  	if size > mdSize {
   269  		nsize = min(size+grow, xattrMaxSize)
   270  		g.maxLmeta.CAS(mdSize, nsize)
   271  	} else if mdSize == xattrMaxSize && size < xattrMaxSize-grow {
   272  		nsize = min(size+grow, (size+xattrMaxSize)/2)
   273  		g.maxLmeta.CAS(mdSize, nsize)
   274  	}
   275  }
   276  
   277  ///////////
   278  // lmeta //
   279  ///////////
   280  
   281  const lomDirtyMask = uint64(1 << 63)
   282  
   283  func (md *lmeta) makeDirty()    { md.atimefs |= lomDirtyMask }
   284  func (md *lmeta) clearDirty()   { md.atimefs &= ^lomDirtyMask }
   285  func (md *lmeta) isDirty() bool { return md.atimefs&lomDirtyMask == lomDirtyMask }
   286  
   287  func (md *lmeta) pushrt() []uint64 {
   288  	return []uint64{uint64(md.Atime), md.atimefs, md.bckID}
   289  }
   290  
   291  func (md *lmeta) poprt(saved []uint64) {
   292  	md.Atime, md.atimefs, md.bckID = int64(saved[0]), saved[1], saved[2]
   293  }
   294  
   295  func (md *lmeta) unmarshal(buf []byte) error {
   296  	const invalid = "invalid lmeta"
   297  	var (
   298  		payload                           string
   299  		expectedCksum, actualCksum        uint64
   300  		cksumType, cksumValue             string
   301  		haveSize, haveVersion, haveCopies bool
   302  		haveCksumType, haveCksumValue     bool
   303  		last                              bool
   304  	)
   305  	if len(buf) < prefLen {
   306  		return fmt.Errorf("%s: too short (%d)", invalid, len(buf))
   307  	}
   308  	if buf[0] != cmn.MetaverLOM {
   309  		return fmt.Errorf("%s: unknown version %d", invalid, buf[0])
   310  	}
   311  	if buf[1] != mdCksumTyXXHash {
   312  		return fmt.Errorf("%s: unknown checksum %d", invalid, buf[1])
   313  	}
   314  	payload = string(buf[prefLen:])
   315  	actualCksum = xxhash.Checksum64S(buf[prefLen:], cos.MLCG32)
   316  	expectedCksum = binary.BigEndian.Uint64(buf[2:])
   317  	if expectedCksum != actualCksum {
   318  		return cos.NewErrMetaCksum(expectedCksum, actualCksum, md.String())
   319  	}
   320  
   321  	for off := 0; !last; {
   322  		var (
   323  			record string
   324  			i      = strings.Index(payload[off:], recordSepa)
   325  		)
   326  		if i < 0 {
   327  			record = payload[off:]
   328  			last = true
   329  		} else {
   330  			record = payload[off : off+i]
   331  		}
   332  		key := int(binary.BigEndian.Uint16([]byte(record)))
   333  		val := record[cos.SizeofI16:]
   334  		off += i + lenRecSepa
   335  		switch key {
   336  		case lomCksumValue:
   337  			if haveCksumValue {
   338  				return errors.New(invalid + " #1")
   339  			}
   340  			cksumValue = val
   341  			haveCksumValue = true
   342  		case lomCksumType:
   343  			if haveCksumType {
   344  				return errors.New(invalid + " #2")
   345  			}
   346  			cksumType = val
   347  			haveCksumType = true
   348  		case lomObjVersion:
   349  			if haveVersion {
   350  				return errors.New(invalid + " #3")
   351  			}
   352  			md.Ver = val
   353  			haveVersion = true
   354  		case lomObjSize:
   355  			if haveSize {
   356  				return errors.New(invalid + " #4")
   357  			}
   358  			md.Size = int64(binary.BigEndian.Uint64([]byte(val)))
   359  			haveSize = true
   360  		case lomObjCopies:
   361  			if haveCopies {
   362  				return errors.New(invalid + " #5")
   363  			}
   364  			copyFQNs := strings.Split(val, copyFQNSepa)
   365  			haveCopies = true
   366  			md.copies = make(fs.MPI, len(copyFQNs))
   367  			for _, copyFQN := range copyFQNs {
   368  				if copyFQN == "" {
   369  					return errors.New(invalid + " #5.1")
   370  				}
   371  
   372  				mpathInfo, _, err := fs.FQN2Mpath(copyFQN)
   373  				if err != nil {
   374  					// Mountpath with the copy is missing.
   375  					if cmn.Rom.FastV(4, cos.SmoduleCluster) {
   376  						nlog.Warningln(err)
   377  					}
   378  					// For utilities and tests: fill the map with mpath names always
   379  					if os.Getenv(DumpLomEnvVar) != "" {
   380  						md.copies[copyFQN] = nil
   381  					}
   382  					continue
   383  				}
   384  				md.copies[copyFQN] = mpathInfo
   385  			}
   386  		case lomCustomMD:
   387  			entries := strings.Split(val, customMDSepa)
   388  			custom := make(cos.StrKVs, len(entries)/2)
   389  			for i := 0; i < len(entries); i += 2 {
   390  				custom[entries[i]] = entries[i+1]
   391  			}
   392  			md.SetCustomMD(custom)
   393  		default:
   394  			return errors.New(invalid + " #6")
   395  		}
   396  	}
   397  	if haveCksumType != haveCksumValue {
   398  		return errors.New(invalid + " #7")
   399  	}
   400  	md.Cksum = cos.NewCksum(cksumType, cksumValue)
   401  	if !haveSize {
   402  		return errors.New(invalid + " #8")
   403  	}
   404  	return nil
   405  }
   406  
   407  func (md *lmeta) marshal(mdSize int64) (buf []byte) {
   408  	var (
   409  		b8                    [cos.SizeofI64]byte
   410  		cksumType, cksumValue = md.Cksum.Get()
   411  	)
   412  	buf, _ = g.smm.AllocSize(mdSize)
   413  	buf = buf[:prefLen] // hold it for md-xattr checksum (below)
   414  
   415  	// serialize
   416  	buf = _marshRecord(buf, lomCksumType, cksumType, true)
   417  	buf = _marshRecord(buf, lomCksumValue, cksumValue, true)
   418  	if md.Ver != "" {
   419  		buf = _marshRecord(buf, lomObjVersion, md.Ver, true)
   420  	}
   421  	binary.BigEndian.PutUint64(b8[:], uint64(md.Size))
   422  	buf = _marshRecord(buf, lomObjSize, string(b8[:]), false)
   423  	if len(md.copies) > 0 {
   424  		buf = g.smm.Append(buf, recordSepa)
   425  		buf = _marshRecord(buf, lomObjCopies, "", false)
   426  		buf = _marshCopies(buf, md.copies)
   427  	}
   428  	if custom := md.GetCustomMD(); len(custom) > 0 {
   429  		buf = g.smm.Append(buf, recordSepa)
   430  		buf = _marshRecord(buf, lomCustomMD, "", false)
   431  		buf = _marshCustomMD(buf, custom)
   432  	}
   433  
   434  	// checksum, prepend, and return
   435  	buf[0] = cmn.MetaverLOM
   436  	buf[1] = mdCksumTyXXHash
   437  	mdCksumValue := xxhash.Checksum64S(buf[prefLen:], cos.MLCG32)
   438  	binary.BigEndian.PutUint64(buf[2:], mdCksumValue)
   439  	return
   440  }
   441  
   442  func _marshRecord(buf []byte, key int, value string, sepa bool) []byte {
   443  	var bkey [cos.SizeofI16]byte
   444  	binary.BigEndian.PutUint16(bkey[:], uint16(key))
   445  	buf = g.smm.Append(buf, string(bkey[:]))
   446  	buf = g.smm.Append(buf, value)
   447  	if sepa {
   448  		buf = g.smm.Append(buf, recordSepa)
   449  	}
   450  	return buf
   451  }
   452  
   453  func _marshCopies(buf []byte, copies fs.MPI) []byte {
   454  	var (
   455  		i   int
   456  		num = len(copies)
   457  	)
   458  	for copyFQN := range copies {
   459  		debug.Assert(copyFQN != "")
   460  		i++
   461  		buf = g.smm.Append(buf, copyFQN)
   462  		if i < num {
   463  			buf = g.smm.Append(buf, copyFQNSepa)
   464  		}
   465  	}
   466  	return buf
   467  }
   468  
   469  func _marshCustomMD(buf []byte, md cos.StrKVs) []byte {
   470  	var (
   471  		i   int
   472  		num = len(md)
   473  	)
   474  	for k, v := range md {
   475  		debug.Assert(k != "")
   476  		i++
   477  		buf = g.smm.Append(buf, k)
   478  		buf = g.smm.Append(buf, customMDSepa)
   479  		buf = g.smm.Append(buf, v)
   480  		if i < num {
   481  			buf = g.smm.Append(buf, customMDSepa)
   482  		}
   483  	}
   484  	return buf
   485  }
   486  
   487  // copy atime IFF valid and more recent
   488  func (md *lmeta) cpAtime(from *lmeta) {
   489  	if !cos.IsValidAtime(from.Atime) {
   490  		return
   491  	}
   492  	if !cos.IsValidAtime(md.Atime) || (md.Atime > 0 && md.Atime < from.Atime) {
   493  		md.Atime = from.Atime
   494  	}
   495  }