github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/bucketmeta.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "fmt" 9 "net/http" 10 "net/textproto" 11 "os" 12 "path/filepath" 13 "strconv" 14 "sync" 15 ratomic "sync/atomic" 16 "time" 17 18 "github.com/NVIDIA/aistore/api/apc" 19 "github.com/NVIDIA/aistore/cmn" 20 "github.com/NVIDIA/aistore/cmn/cos" 21 "github.com/NVIDIA/aistore/cmn/debug" 22 "github.com/NVIDIA/aistore/cmn/fname" 23 "github.com/NVIDIA/aistore/cmn/jsp" 24 "github.com/NVIDIA/aistore/cmn/nlog" 25 "github.com/NVIDIA/aistore/core" 26 "github.com/NVIDIA/aistore/core/meta" 27 "github.com/NVIDIA/aistore/fs" 28 "github.com/NVIDIA/aistore/memsys" 29 ) 30 31 // NOTE: to access bucket metadata and related structures, external 32 // packages and HTTP clients must import aistore/cluster (and not ais) 33 34 // - bucketMD is a server-side extension of the meta.BMD 35 // - bucketMD represents buckets (that store objects) and associated metadata 36 // - bucketMD (instance) can be obtained via bmdOwner.get() 37 // - bucketMD is immutable and versioned 38 // - bucketMD versioning is monotonic and incremental 39 // 40 // - bucketMD typical update transaction: 41 // lock -- clone() -- modify the clone -- bmdOwner.put(clone) -- unlock 42 // 43 // (*) for merges and conflict resolution, check the current version prior to put() 44 // (note that version check must be protected by the same critical section) 45 // 46 47 const bmdCopies = 2 // local copies 48 49 type ( 50 bucketMD struct { 51 cksum *cos.Cksum // BMD checksum 52 _sgl *memsys.SGL // jsp-formatted 53 vstr string // itoa(Version), to have it handy for http redirects 54 meta.BMD 55 } 56 bmdOwner interface { 57 sync.Locker 58 Get() *meta.BMD 59 60 init() bool // true when loaded previous version 61 get() (bmd *bucketMD) 62 putPersist(bmd *bucketMD, payload msPayload) error 63 persist(clone *bucketMD, payload msPayload) error 64 modify(*bmdModifier) (*bucketMD, error) 65 } 66 bmdOwnerBase struct { 67 bmd ratomic.Pointer[bucketMD] 68 sync.Mutex 69 } 70 bmdOwnerPrx struct { 71 bmdOwnerBase 72 fpath string 73 } 74 bmdOwnerTgt struct{ bmdOwnerBase } 75 76 bmdModifier struct { 77 pre func(*bmdModifier, *bucketMD) error 78 final func(*bmdModifier, *bucketMD) 79 80 msg *apc.ActMsg 81 txnID string // transaction UUID 82 bcks []*meta.Bck 83 84 propsToUpdate *cmn.BpropsToSet // update existing props 85 revertProps *cmn.BpropsToSet // props to revert 86 setProps *cmn.Bprops // new props to set 87 88 wait bool 89 needReMirror bool 90 needReEC bool 91 terminate bool 92 singleTarget bool 93 } 94 ) 95 96 // interface guard 97 var ( 98 _ revs = (*bucketMD)(nil) 99 _ meta.Bowner = (*bmdOwnerBase)(nil) 100 _ bmdOwner = (*bmdOwnerPrx)(nil) 101 _ bmdOwner = (*bmdOwnerTgt)(nil) 102 ) 103 104 var bmdImmSize int64 105 106 // c-tor 107 func newBucketMD() *bucketMD { 108 providers := make(meta.Providers, 2) 109 namespaces := make(meta.Namespaces, 1) 110 providers[apc.AIS] = namespaces 111 buckets := make(meta.Buckets, 16) 112 debug.Assert(cmn.NsGlobalUname == cmn.NsGlobal.Uname()) 113 namespaces[cmn.NsGlobalUname] = buckets 114 115 return &bucketMD{BMD: meta.BMD{Providers: providers, UUID: ""}} 116 } 117 118 func newClusterUUID() (uuid, created string) { 119 return cos.GenUUID(), time.Now().String() 120 } 121 122 ////////////// 123 // bucketMD // 124 ////////////// 125 126 func (m *bucketMD) add(bck *meta.Bck, p *cmn.Bprops) bool { 127 debug.Assert(apc.IsProvider(bck.Provider)) 128 if _, present := m.Get(bck); present { 129 return false 130 } 131 132 if m.Version == 0 { 133 m.Version = 1 // on-the-fly (e.g. via PUT remote) w/ brand-new cluster 134 } 135 p.SetProvider(bck.Provider) 136 p.BID = bck.MaskBID(m.Version) 137 p.Created = time.Now().UnixNano() 138 bck.Props = p 139 140 m.Add(bck) 141 m.Version++ 142 143 return true 144 } 145 146 func (m *bucketMD) del(bck *meta.Bck) (deleted bool) { 147 if !m.Del(bck) { 148 return 149 } 150 m.Version++ 151 return true 152 } 153 154 func (m *bucketMD) set(bck *meta.Bck, p *cmn.Bprops) { 155 debug.Assert(apc.IsProvider(bck.Provider)) 156 prevProps, present := m.Get(bck) 157 if !present { 158 debug.Assertf(false, "%s: not present", bck) 159 } 160 debug.Assert(prevProps.BID != 0) 161 162 p.SetProvider(bck.Provider) 163 p.BID = prevProps.BID 164 165 // make sure bck.backend, if exists, references backend's own props in the BMD 166 if p.BackendBck.Name != "" && p.BackendBck.Props == nil { 167 if provider, err := cmn.NormalizeProvider(p.BackendBck.Provider); err == nil { 168 p.BackendBck.Provider = provider 169 p.BackendBck.Props, _ = m.Get((*meta.Bck)(&p.BackendBck)) 170 } 171 } 172 173 m.Set(bck, p) 174 175 m.Version++ 176 } 177 178 func (m *bucketMD) clone() *bucketMD { 179 dst := &bucketMD{} 180 181 // deep copy 182 *dst = *m 183 dst.Providers = make(meta.Providers, len(m.Providers)) 184 for provider, namespaces := range m.Providers { 185 dstNamespaces := make(meta.Namespaces, len(namespaces)) 186 for ns, buckets := range namespaces { 187 dstBuckets := make(meta.Buckets, len(buckets)) 188 for name, p := range buckets { 189 dstProps := &cmn.Bprops{} 190 *dstProps = *p 191 dstBuckets[name] = dstProps 192 } 193 dstNamespaces[ns] = dstBuckets 194 } 195 dst.Providers[provider] = dstNamespaces 196 } 197 198 dst.vstr = m.vstr 199 dst._sgl = nil 200 return dst 201 } 202 203 func (m *bucketMD) validateUUID(nbmd *bucketMD, si, nsi *meta.Snode, caller string) (err error) { 204 if nbmd == nil || nbmd.Version == 0 || m.Version == 0 { 205 return 206 } 207 if !cos.IsValidUUID(m.UUID) || !cos.IsValidUUID(nbmd.UUID) { 208 return 209 } 210 if m.UUID == nbmd.UUID { 211 return 212 } 213 nsiname := caller 214 if nsi != nil { 215 nsiname = nsi.StringEx() 216 } else if nsiname == "" { 217 nsiname = "???" 218 } 219 hname := si.Name() 220 // FATAL: cluster integrity error (cie) 221 s := fmt.Sprintf("%s: BMDs have different UUIDs: (%s, %s) vs (%s, %s)", 222 ciError(40), hname, m.StringEx(), nsiname, nbmd.StringEx()) 223 err = &errPrxBmdUUIDDiffer{s} 224 return 225 } 226 227 // as revs 228 func (*bucketMD) tag() string { return revsBMDTag } 229 func (m *bucketMD) version() int64 { return m.Version } 230 func (*bucketMD) jit(p *proxy) revs { return p.owner.bmd.get() } 231 232 func (m *bucketMD) sgl() *memsys.SGL { 233 if m._sgl.IsNil() { 234 return nil 235 } 236 return m._sgl 237 } 238 239 func (m *bucketMD) marshal() []byte { 240 m._sgl = m._encode() 241 return m._sgl.Bytes() 242 } 243 244 func (m *bucketMD) _encode() (sgl *memsys.SGL) { 245 sgl = memsys.PageMM().NewSGL(bmdImmSize) 246 err := jsp.Encode(sgl, m, m.JspOpts()) 247 debug.AssertNoErr(err) 248 bmdImmSize = max(bmdImmSize, sgl.Len()) 249 return 250 } 251 252 ////////////////// 253 // bmdOwnerBase // 254 ////////////////// 255 256 func (bo *bmdOwnerBase) Get() *meta.BMD { return &bo.get().BMD } 257 func (bo *bmdOwnerBase) get() (bmd *bucketMD) { return bo.bmd.Load() } 258 259 func (bo *bmdOwnerBase) put(bmd *bucketMD) { 260 bmd.vstr = strconv.FormatInt(bmd.Version, 10) 261 bo.bmd.Store(bmd) 262 } 263 264 // write metasync-sent bytes directly (no json) 265 func (*bmdOwnerBase) persistBytes(payload msPayload, fpath string) (done bool) { 266 if payload == nil { 267 return 268 } 269 bmdValue := payload[revsBMDTag] 270 if bmdValue == nil { 271 return 272 } 273 var ( 274 bmd *meta.BMD 275 wto = cos.NewBuffer(bmdValue) 276 err = jsp.SaveMeta(fpath, bmd, wto) 277 ) 278 done = err == nil 279 return 280 } 281 282 ///////////////// 283 // bmdOwnerPrx // 284 ///////////////// 285 286 func newBMDOwnerPrx(config *cmn.Config) *bmdOwnerPrx { 287 return &bmdOwnerPrx{fpath: filepath.Join(config.ConfigDir, fname.Bmd)} 288 } 289 290 func (bo *bmdOwnerPrx) init() (prev bool) { 291 bmd, err := _loadBMD(bo.fpath) 292 if err != nil { 293 if !os.IsNotExist(err) { 294 nlog.Errorf("failed to load %s from %s, err: %v", bmd, bo.fpath, err) 295 } else { 296 nlog.Infof("%s does not exist at %s - initializing", bmd, bo.fpath) 297 } 298 } 299 bo.put(bmd) 300 return 301 } 302 303 func (bo *bmdOwnerPrx) putPersist(bmd *bucketMD, payload msPayload) (err error) { 304 if !bo.persistBytes(payload, bo.fpath) { 305 debug.Assert(bmd._sgl == nil) 306 bmd._sgl = bmd._encode() 307 err = jsp.SaveMeta(bo.fpath, bmd, bmd._sgl) 308 if err != nil { 309 bmd._sgl.Free() 310 bmd._sgl = nil 311 } 312 } 313 if err == nil { 314 bo.put(bmd) 315 } 316 return 317 } 318 319 func (*bmdOwnerPrx) persist(_ *bucketMD, _ msPayload) (err error) { debug.Assert(false); return } 320 321 // under lock 322 func (bo *bmdOwnerPrx) _pre(ctx *bmdModifier) (clone *bucketMD, err error) { 323 clone = bo.get().clone() 324 if err = ctx.pre(ctx, clone); err != nil || ctx.terminate { 325 return 326 } 327 err = bo.putPersist(clone, nil) 328 return 329 } 330 331 func (bo *bmdOwnerPrx) modify(ctx *bmdModifier) (clone *bucketMD, err error) { 332 bo.Lock() 333 clone, err = bo._pre(ctx) 334 bo.Unlock() 335 if err != nil || ctx.terminate { 336 if clone._sgl != nil { 337 clone._sgl.Free() 338 clone._sgl = nil 339 } 340 return 341 } 342 if ctx.final != nil { 343 ctx.final(ctx, clone) 344 } else if clone._sgl != nil { 345 clone._sgl.Free() 346 clone._sgl = nil 347 } 348 return 349 } 350 351 ///////////////// 352 // bmdOwnerTgt // 353 ///////////////// 354 355 func newBMDOwnerTgt() *bmdOwnerTgt { 356 return &bmdOwnerTgt{} 357 } 358 359 func (bo *bmdOwnerTgt) init() (prev bool) { 360 var ( 361 bmd *bucketMD 362 available = fs.GetAvail() 363 ) 364 if bmd = loadBMD(available, fname.Bmd); bmd != nil { 365 nlog.Infof("loaded %s", bmd) 366 goto finalize 367 } 368 if bmd = loadBMD(available, fname.BmdPrevious); bmd != nil { 369 nlog.Errorf("loaded previous version of the %s (%q)", bmd, fname.BmdPrevious) 370 prev = true 371 goto finalize 372 } 373 bmd = newBucketMD() 374 nlog.Warningf("initializing new %s", bmd) 375 376 finalize: 377 bo.put(bmd) 378 return 379 } 380 381 func (bo *bmdOwnerTgt) putPersist(bmd *bucketMD, payload msPayload) (err error) { 382 if err = bo.persist(bmd, payload); err == nil { 383 bo.put(bmd) 384 } 385 return 386 } 387 388 func (*bmdOwnerTgt) persist(clone *bucketMD, payload msPayload) (err error) { 389 var ( 390 b []byte 391 sgl *memsys.SGL 392 ) 393 if payload != nil { 394 if bmdValue := payload[revsBMDTag]; bmdValue != nil { 395 b = bmdValue 396 } 397 } 398 if b == nil { 399 sgl = clone._encode() 400 defer sgl.Free() 401 } 402 cnt, availCnt := fs.PersistOnMpaths(fname.Bmd, fname.BmdPrevious, clone, bmdCopies, b, sgl) 403 if cnt > 0 { 404 return 405 } 406 if availCnt == 0 { 407 nlog.Errorf("Cannot store %s: %v", clone, cmn.ErrNoMountpaths) 408 return 409 } 410 err = fmt.Errorf("failed to store %s on any of the mountpaths (%d)", clone, availCnt) 411 nlog.Errorln(err) 412 return 413 } 414 415 func (*bmdOwnerTgt) modify(_ *bmdModifier) (*bucketMD, error) { 416 debug.Assert(false) 417 return nil, nil 418 } 419 420 func loadBMD(mpaths fs.MPI, path string) (mainBMD *bucketMD) { 421 for _, mpath := range mpaths { 422 bmd := loadBMDFromMpath(mpath, path) 423 if bmd == nil { 424 continue 425 } 426 if mainBMD == nil { 427 mainBMD = bmd 428 continue 429 } 430 if mainBMD.cksum.Equal(bmd.cksum) { 431 continue 432 } 433 if mainBMD.Version == bmd.Version { 434 cos.ExitLogf("BMD is different (%q): %v vs %v", mpath, mainBMD, bmd) 435 } 436 nlog.Errorf("Warning: detected different BMD versions (%q): %v != %v", mpath, mainBMD, bmd) 437 if mainBMD.Version < bmd.Version { 438 mainBMD = bmd 439 } 440 } 441 return 442 } 443 444 func _loadBMD(path string) (bmd *bucketMD, err error) { 445 bmd = newBucketMD() 446 bmd.cksum, err = jsp.LoadMeta(path, bmd) 447 if _, ok := err.(*jsp.ErrUnsupportedMetaVersion); ok { 448 nlog.Errorf(cmn.FmtErrBackwardCompat, err) 449 } 450 return 451 } 452 453 func loadBMDFromMpath(mpath *fs.Mountpath, path string) (bmd *bucketMD) { 454 var ( 455 fpath = filepath.Join(mpath.Path, path) 456 err error 457 ) 458 bmd, err = _loadBMD(fpath) 459 if err == nil { 460 return bmd 461 } 462 if !os.IsNotExist(err) { 463 // Should never be NotExist error as mpi should include only mpaths with relevant bmds stored. 464 nlog.Errorf("failed to load %s from %s, err: %v", bmd, fpath, err) 465 } 466 return nil 467 } 468 469 func hasEnoughBMDCopies() bool { return fs.CountPersisted(fname.Bmd) >= bmdCopies } 470 471 ////////////////////////// 472 // default bucket props // 473 ////////////////////////// 474 475 type bckPropsArgs struct { 476 bck *meta.Bck // Base bucket for determining default bucket props. 477 hdr http.Header // Header with remote bucket properties. 478 } 479 480 // Convert HEAD(bucket) response to cmn.Bprops (compare with `defaultBckProps`) 481 func remoteBckProps(args bckPropsArgs) (props *cmn.Bprops, err error) { 482 props = &cmn.Bprops{} 483 err = cmn.IterFields(props, func(tag string, field cmn.IterField) (error, bool) { 484 headerName := textproto.CanonicalMIMEHeaderKey(tag) 485 // skip the missing ones 486 if _, ok := args.hdr[headerName]; !ok { 487 return nil, false 488 } 489 // single-value 490 return field.SetValue(args.hdr.Get(headerName), true /*force*/), false 491 }, cmn.IterOpts{OnlyRead: false}) 492 return 493 } 494 495 // Used to initialize "local" bucket, in particular when there's a remote one 496 // (compare with `remoteBckProps` above) 497 // See also: 498 // - github.com/NVIDIA/aistore/blob/main/docs/bucket.md#default-bucket-properties 499 // - cmn.BpropsToSet 500 // - cmn.Bck.DefaultProps 501 func defaultBckProps(args bckPropsArgs) (props *cmn.Bprops) { 502 config := cmn.GCO.Get() 503 props = args.bck.Bucket().DefaultProps(&config.ClusterConfig) 504 props.SetProvider(args.bck.Provider) 505 506 switch { 507 case args.bck.IsAIS(): 508 debug.Assert(args.hdr == nil) 509 case args.bck.Backend() != nil: 510 debug.Assertf(args.hdr == nil, "%s, hdr=%+v", args.bck, args.hdr) 511 case args.bck.IsRemote(): 512 debug.Assert(args.hdr != nil) 513 props.Versioning.Enabled = false 514 props = mergeRemoteBckProps(props, args.hdr) 515 default: 516 debug.Assert(false) 517 } 518 err := props.Validate(9999 /*targetCnt*/) 519 debug.AssertNoErr(err) 520 return 521 } 522 523 func mergeRemoteBckProps(props *cmn.Bprops, header http.Header) *cmn.Bprops { 524 debug.Assert(len(header) > 0) 525 switch props.Provider { 526 case apc.AWS: 527 props.Extra.AWS.CloudRegion = header.Get(apc.HdrS3Region) 528 props.Extra.AWS.Endpoint = header.Get(apc.HdrS3Endpoint) 529 props.Extra.AWS.Profile = header.Get(apc.HdrS3Profile) 530 case apc.HTTP: 531 props.Extra.HTTP.OrigURLBck = header.Get(apc.HdrOrigURLBck) 532 } 533 534 if verStr := header.Get(apc.HdrBucketVerEnabled); verStr != "" { 535 versioning, err := cos.ParseBool(verStr) 536 debug.AssertNoErr(err) 537 props.Versioning.Enabled = versioning 538 } 539 return props 540 } 541 542 // returns (uname, nlc) pair to lock/unlock buckets 543 func newBckNLP(b *meta.Bck) core.NLP { return core.NewNLP(b.MakeUname("")) }