// Source: github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/core/lom_xattr.go

// Package core provides core metadata and in-cluster API
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package core

import (
	"encoding/binary"
	"errors"
	"fmt"
	"os"
	"strings"
	"syscall"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/ios"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/OneOfOne/xxhash"
)

// On-disk metadata layout - changing any of this must be done with respect
// to backward compatibility (and with caution).
//
// | ------------------ PREAMBLE ----------------- | --- MD VALUES ---- |
// | --- 1 --- | ----- 1 ----- | -- [CKSUM LEN] -- | - [METADATA LEN] - |
// |  version  | checksum-type |   checksum-value  | ---- metadata ---- |
//
// * version - determines the layout version. Thanks to this we can be backward
//   compatible and deprecate old versions if needed.
// * checksum-type - determines the checksum algorithm used to compute checksum
//   of the metadata.
// * checksum-value - computed checksum of the metadata. The length of the checksum
//   can vary depending on the checksum algorithm.
// * metadata - the rest of the layout. The content of the metadata can vary depending
//   on the version of the layout.
42 43 // the one and only currently supported checksum type == xxhash; 44 // NOTE: adding more checksums will likely require a new cmn.MetaverLOM version 45 const mdCksumTyXXHash = 1 46 47 const ( 48 XattrLOM = "user.ais.lom" // on-disk xattr name 49 xattrMaxSize = memsys.MaxSmallSlabSize 50 DumpLomEnvVar = "AIS_DUMP_LOM" 51 ) 52 53 // packing format internal attrs 54 const ( 55 lomCksumType = iota 56 lomCksumValue 57 lomObjVersion 58 lomObjSize 59 lomObjCopies 60 lomCustomMD 61 ) 62 63 // packing format separators 64 const ( 65 copyFQNSepa = "\x00" 66 customMDSepa = "\x01" 67 recordSepa = "\xe3/\xbd" 68 lenRecSepa = len(recordSepa) 69 ) 70 71 const prefLen = 10 // 10B prefix [ version = 1 | checksum-type | 64-bit xxhash ] 72 73 const getxattr = "getxattr" // syscall 74 75 // used in tests 76 func (lom *LOM) AcquireAtimefs() error { 77 _, atime, err := ios.FinfoAtime(lom.FQN) 78 if err != nil { 79 return err 80 } 81 lom.md.Atime = atime 82 lom.md.atimefs = uint64(atime) 83 return nil 84 } 85 86 // NOTE: used in tests, ignores `dirty` 87 func (lom *LOM) LoadMetaFromFS() error { 88 _, atime, err := ios.FinfoAtime(lom.FQN) 89 if err != nil { 90 return err 91 } 92 if _, err := lom.lmfs(true); err != nil { 93 return err 94 } 95 lom.md.Atime = atime 96 lom.md.atimefs = uint64(atime) 97 return nil 98 } 99 100 func whingeLmeta(err error) (*lmeta, error) { 101 if cos.IsErrXattrNotFound(err) { 102 return nil, cmn.NewErrLmetaNotFound(err) 103 } 104 return nil, os.NewSyscallError(getxattr, err) 105 } 106 107 func (lom *LOM) lmfsReload(populate bool) (md *lmeta, err error) { 108 saved := lom.md.pushrt() 109 md, err = lom.lmfs(populate) 110 if err == nil { 111 md.poprt(saved) 112 } 113 return 114 } 115 116 func (lom *LOM) lmfs(populate bool) (md *lmeta, err error) { 117 var ( 118 size int64 119 read []byte 120 mdSize = g.maxLmeta.Load() 121 buf, slab = g.smm.AllocSize(mdSize) 122 ) 123 read, err = fs.GetXattrBuf(lom.FQN, XattrLOM, buf) 124 if err != nil { 125 slab.Free(buf) 126 
if err != syscall.ERANGE { 127 return whingeLmeta(err) 128 } 129 debug.Assert(mdSize < xattrMaxSize) 130 // 2nd attempt: max-size 131 buf, slab = g.smm.AllocSize(xattrMaxSize) 132 read, err = fs.GetXattrBuf(lom.FQN, XattrLOM, buf) 133 if err != nil { 134 slab.Free(buf) 135 return whingeLmeta(err) 136 } 137 } 138 size = int64(len(read)) 139 if size == 0 { 140 nlog.Errorf("%s[%s]: ENOENT", lom, lom.FQN) 141 err = os.NewSyscallError(getxattr, syscall.ENOENT) 142 slab.Free(buf) 143 return 144 } 145 md = &lom.md 146 if !populate { 147 md = &lmeta{} 148 } 149 err = md.unmarshal(read) 150 if err == nil { 151 _recomputeMdSize(size, mdSize) 152 } else { 153 err = cmn.NewErrLmetaCorrupted(err) 154 } 155 slab.Free(buf) 156 return 157 } 158 159 func (lom *LOM) PersistMain() (err error) { 160 atime := lom.AtimeUnix() 161 debug.Assert(cos.IsValidAtime(atime)) 162 if atime < 0 /*prefetch*/ || !lom.WritePolicy().IsImmediate() /*write-never, write-delayed*/ { 163 lom.md.makeDirty() 164 lom.Recache() 165 return 166 } 167 // write-immediate (default) 168 buf := lom.marshal() 169 if err = fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil { 170 lom.Uncache() 171 T.FSHC(err, lom.FQN) 172 } else { 173 lom.md.clearDirty() 174 lom.Recache() 175 } 176 g.smm.Free(buf) 177 return 178 } 179 180 // (caller must set atime; compare with the above) 181 func (lom *LOM) Persist() (err error) { 182 atime := lom.AtimeUnix() 183 debug.Assert(cos.IsValidAtime(atime), atime) 184 185 if atime < 0 || !lom.WritePolicy().IsImmediate() { 186 lom.md.makeDirty() 187 if lom.Bprops() != nil { 188 if !lom.IsCopy() { 189 lom.Recache() 190 } 191 lom.md.bckID = lom.Bprops().BID 192 } 193 return 194 } 195 196 buf := lom.marshal() 197 if err = fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil { 198 lom.Uncache() 199 T.FSHC(err, lom.FQN) 200 } else { 201 lom.md.clearDirty() 202 if lom.Bprops() != nil { 203 if !lom.IsCopy() { 204 lom.Recache() 205 } 206 lom.md.bckID = lom.Bprops().BID 207 } 208 } 209 g.smm.Free(buf) 210 
return 211 } 212 213 func (lom *LOM) persistMdOnCopies() (copyFQN string, err error) { 214 buf := lom.marshal() 215 // replicate across copies 216 for copyFQN = range lom.md.copies { 217 if copyFQN == lom.FQN { 218 continue 219 } 220 if err = fs.SetXattr(copyFQN, XattrLOM, buf); err != nil { 221 break 222 } 223 } 224 g.smm.Free(buf) 225 return 226 } 227 228 // NOTE: not clearing dirty flag as the caller will uncache anyway 229 func (lom *LOM) flushCold(md *lmeta, atime time.Time) { 230 if err := lom.flushAtime(atime); err != nil { 231 return 232 } 233 if !md.isDirty() || lom.WritePolicy() == apc.WriteNever { 234 return 235 } 236 lom.md = *md 237 if err := lom.syncMetaWithCopies(); err != nil { 238 return 239 } 240 buf := lom.marshal() 241 if err := fs.SetXattr(lom.FQN, XattrLOM, buf); err != nil { 242 T.FSHC(err, lom.FQN) 243 } 244 g.smm.Free(buf) 245 } 246 247 func (lom *LOM) flushAtime(atime time.Time) error { 248 finfo, err := os.Stat(lom.FQN) 249 if err != nil { 250 return err 251 } 252 mtime := finfo.ModTime() 253 return os.Chtimes(lom.FQN, atime, mtime) 254 } 255 256 func (lom *LOM) marshal() (buf []byte) { 257 lmsize := g.maxLmeta.Load() 258 buf = lom.md.marshal(lmsize) 259 size := int64(len(buf)) 260 debug.Assert(size <= xattrMaxSize) 261 _recomputeMdSize(size, lmsize) 262 return 263 } 264 265 func _recomputeMdSize(size, mdSize int64) { 266 const grow = memsys.SmallSlabIncStep 267 var nsize int64 268 if size > mdSize { 269 nsize = min(size+grow, xattrMaxSize) 270 g.maxLmeta.CAS(mdSize, nsize) 271 } else if mdSize == xattrMaxSize && size < xattrMaxSize-grow { 272 nsize = min(size+grow, (size+xattrMaxSize)/2) 273 g.maxLmeta.CAS(mdSize, nsize) 274 } 275 } 276 277 /////////// 278 // lmeta // 279 /////////// 280 281 const lomDirtyMask = uint64(1 << 63) 282 283 func (md *lmeta) makeDirty() { md.atimefs |= lomDirtyMask } 284 func (md *lmeta) clearDirty() { md.atimefs &= ^lomDirtyMask } 285 func (md *lmeta) isDirty() bool { return md.atimefs&lomDirtyMask == 
lomDirtyMask } 286 287 func (md *lmeta) pushrt() []uint64 { 288 return []uint64{uint64(md.Atime), md.atimefs, md.bckID} 289 } 290 291 func (md *lmeta) poprt(saved []uint64) { 292 md.Atime, md.atimefs, md.bckID = int64(saved[0]), saved[1], saved[2] 293 } 294 295 func (md *lmeta) unmarshal(buf []byte) error { 296 const invalid = "invalid lmeta" 297 var ( 298 payload string 299 expectedCksum, actualCksum uint64 300 cksumType, cksumValue string 301 haveSize, haveVersion, haveCopies bool 302 haveCksumType, haveCksumValue bool 303 last bool 304 ) 305 if len(buf) < prefLen { 306 return fmt.Errorf("%s: too short (%d)", invalid, len(buf)) 307 } 308 if buf[0] != cmn.MetaverLOM { 309 return fmt.Errorf("%s: unknown version %d", invalid, buf[0]) 310 } 311 if buf[1] != mdCksumTyXXHash { 312 return fmt.Errorf("%s: unknown checksum %d", invalid, buf[1]) 313 } 314 payload = string(buf[prefLen:]) 315 actualCksum = xxhash.Checksum64S(buf[prefLen:], cos.MLCG32) 316 expectedCksum = binary.BigEndian.Uint64(buf[2:]) 317 if expectedCksum != actualCksum { 318 return cos.NewErrMetaCksum(expectedCksum, actualCksum, md.String()) 319 } 320 321 for off := 0; !last; { 322 var ( 323 record string 324 i = strings.Index(payload[off:], recordSepa) 325 ) 326 if i < 0 { 327 record = payload[off:] 328 last = true 329 } else { 330 record = payload[off : off+i] 331 } 332 key := int(binary.BigEndian.Uint16([]byte(record))) 333 val := record[cos.SizeofI16:] 334 off += i + lenRecSepa 335 switch key { 336 case lomCksumValue: 337 if haveCksumValue { 338 return errors.New(invalid + " #1") 339 } 340 cksumValue = val 341 haveCksumValue = true 342 case lomCksumType: 343 if haveCksumType { 344 return errors.New(invalid + " #2") 345 } 346 cksumType = val 347 haveCksumType = true 348 case lomObjVersion: 349 if haveVersion { 350 return errors.New(invalid + " #3") 351 } 352 md.Ver = val 353 haveVersion = true 354 case lomObjSize: 355 if haveSize { 356 return errors.New(invalid + " #4") 357 } 358 md.Size = 
int64(binary.BigEndian.Uint64([]byte(val))) 359 haveSize = true 360 case lomObjCopies: 361 if haveCopies { 362 return errors.New(invalid + " #5") 363 } 364 copyFQNs := strings.Split(val, copyFQNSepa) 365 haveCopies = true 366 md.copies = make(fs.MPI, len(copyFQNs)) 367 for _, copyFQN := range copyFQNs { 368 if copyFQN == "" { 369 return errors.New(invalid + " #5.1") 370 } 371 372 mpathInfo, _, err := fs.FQN2Mpath(copyFQN) 373 if err != nil { 374 // Mountpath with the copy is missing. 375 if cmn.Rom.FastV(4, cos.SmoduleCluster) { 376 nlog.Warningln(err) 377 } 378 // For utilities and tests: fill the map with mpath names always 379 if os.Getenv(DumpLomEnvVar) != "" { 380 md.copies[copyFQN] = nil 381 } 382 continue 383 } 384 md.copies[copyFQN] = mpathInfo 385 } 386 case lomCustomMD: 387 entries := strings.Split(val, customMDSepa) 388 custom := make(cos.StrKVs, len(entries)/2) 389 for i := 0; i < len(entries); i += 2 { 390 custom[entries[i]] = entries[i+1] 391 } 392 md.SetCustomMD(custom) 393 default: 394 return errors.New(invalid + " #6") 395 } 396 } 397 if haveCksumType != haveCksumValue { 398 return errors.New(invalid + " #7") 399 } 400 md.Cksum = cos.NewCksum(cksumType, cksumValue) 401 if !haveSize { 402 return errors.New(invalid + " #8") 403 } 404 return nil 405 } 406 407 func (md *lmeta) marshal(mdSize int64) (buf []byte) { 408 var ( 409 b8 [cos.SizeofI64]byte 410 cksumType, cksumValue = md.Cksum.Get() 411 ) 412 buf, _ = g.smm.AllocSize(mdSize) 413 buf = buf[:prefLen] // hold it for md-xattr checksum (below) 414 415 // serialize 416 buf = _marshRecord(buf, lomCksumType, cksumType, true) 417 buf = _marshRecord(buf, lomCksumValue, cksumValue, true) 418 if md.Ver != "" { 419 buf = _marshRecord(buf, lomObjVersion, md.Ver, true) 420 } 421 binary.BigEndian.PutUint64(b8[:], uint64(md.Size)) 422 buf = _marshRecord(buf, lomObjSize, string(b8[:]), false) 423 if len(md.copies) > 0 { 424 buf = g.smm.Append(buf, recordSepa) 425 buf = _marshRecord(buf, lomObjCopies, "", false) 
426 buf = _marshCopies(buf, md.copies) 427 } 428 if custom := md.GetCustomMD(); len(custom) > 0 { 429 buf = g.smm.Append(buf, recordSepa) 430 buf = _marshRecord(buf, lomCustomMD, "", false) 431 buf = _marshCustomMD(buf, custom) 432 } 433 434 // checksum, prepend, and return 435 buf[0] = cmn.MetaverLOM 436 buf[1] = mdCksumTyXXHash 437 mdCksumValue := xxhash.Checksum64S(buf[prefLen:], cos.MLCG32) 438 binary.BigEndian.PutUint64(buf[2:], mdCksumValue) 439 return 440 } 441 442 func _marshRecord(buf []byte, key int, value string, sepa bool) []byte { 443 var bkey [cos.SizeofI16]byte 444 binary.BigEndian.PutUint16(bkey[:], uint16(key)) 445 buf = g.smm.Append(buf, string(bkey[:])) 446 buf = g.smm.Append(buf, value) 447 if sepa { 448 buf = g.smm.Append(buf, recordSepa) 449 } 450 return buf 451 } 452 453 func _marshCopies(buf []byte, copies fs.MPI) []byte { 454 var ( 455 i int 456 num = len(copies) 457 ) 458 for copyFQN := range copies { 459 debug.Assert(copyFQN != "") 460 i++ 461 buf = g.smm.Append(buf, copyFQN) 462 if i < num { 463 buf = g.smm.Append(buf, copyFQNSepa) 464 } 465 } 466 return buf 467 } 468 469 func _marshCustomMD(buf []byte, md cos.StrKVs) []byte { 470 var ( 471 i int 472 num = len(md) 473 ) 474 for k, v := range md { 475 debug.Assert(k != "") 476 i++ 477 buf = g.smm.Append(buf, k) 478 buf = g.smm.Append(buf, customMDSepa) 479 buf = g.smm.Append(buf, v) 480 if i < num { 481 buf = g.smm.Append(buf, customMDSepa) 482 } 483 } 484 return buf 485 } 486 487 // copy atime IFF valid and more recent 488 func (md *lmeta) cpAtime(from *lmeta) { 489 if !cos.IsValidAtime(from.Atime) { 490 return 491 } 492 if !cos.IsValidAtime(md.Atime) || (md.Atime > 0 && md.Atime < from.Atime) { 493 md.Atime = from.Atime 494 } 495 }