github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/bucketmeta.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"fmt"
     9  	"net/http"
    10  	"net/textproto"
    11  	"os"
    12  	"path/filepath"
    13  	"strconv"
    14  	"sync"
    15  	ratomic "sync/atomic"
    16  	"time"
    17  
    18  	"github.com/NVIDIA/aistore/api/apc"
    19  	"github.com/NVIDIA/aistore/cmn"
    20  	"github.com/NVIDIA/aistore/cmn/cos"
    21  	"github.com/NVIDIA/aistore/cmn/debug"
    22  	"github.com/NVIDIA/aistore/cmn/fname"
    23  	"github.com/NVIDIA/aistore/cmn/jsp"
    24  	"github.com/NVIDIA/aistore/cmn/nlog"
    25  	"github.com/NVIDIA/aistore/core"
    26  	"github.com/NVIDIA/aistore/core/meta"
    27  	"github.com/NVIDIA/aistore/fs"
    28  	"github.com/NVIDIA/aistore/memsys"
    29  )
    30  
    31  // NOTE: to access bucket metadata and related structures, external
    32  //       packages and HTTP clients must import aistore/cluster (and not ais)
    33  
    34  // - bucketMD is a server-side extension of the meta.BMD
    35  // - bucketMD represents buckets (that store objects) and associated metadata
    36  // - bucketMD (instance) can be obtained via bmdOwner.get()
    37  // - bucketMD is immutable and versioned
    38  // - bucketMD versioning is monotonic and incremental
    39  //
    40  // - bucketMD typical update transaction:
    41  // lock -- clone() -- modify the clone -- bmdOwner.put(clone) -- unlock
    42  //
    43  // (*) for merges and conflict resolution, check the current version prior to put()
    44  //     (note that version check must be protected by the same critical section)
    45  //
    46  
    47  const bmdCopies = 2 // local copies
    48  
    49  type (
    50  	bucketMD struct {
    51  		cksum *cos.Cksum  // BMD checksum
    52  		_sgl  *memsys.SGL // jsp-formatted
    53  		vstr  string      // itoa(Version), to have it handy for http redirects
    54  		meta.BMD
    55  	}
    56  	bmdOwner interface {
    57  		sync.Locker
    58  		Get() *meta.BMD
    59  
    60  		init() bool // true when loaded previous version
    61  		get() (bmd *bucketMD)
    62  		putPersist(bmd *bucketMD, payload msPayload) error
    63  		persist(clone *bucketMD, payload msPayload) error
    64  		modify(*bmdModifier) (*bucketMD, error)
    65  	}
    66  	bmdOwnerBase struct {
    67  		bmd ratomic.Pointer[bucketMD]
    68  		sync.Mutex
    69  	}
    70  	bmdOwnerPrx struct {
    71  		bmdOwnerBase
    72  		fpath string
    73  	}
    74  	bmdOwnerTgt struct{ bmdOwnerBase }
    75  
    76  	bmdModifier struct {
    77  		pre   func(*bmdModifier, *bucketMD) error
    78  		final func(*bmdModifier, *bucketMD)
    79  
    80  		msg   *apc.ActMsg
    81  		txnID string // transaction UUID
    82  		bcks  []*meta.Bck
    83  
    84  		propsToUpdate *cmn.BpropsToSet // update existing props
    85  		revertProps   *cmn.BpropsToSet // props to revert
    86  		setProps      *cmn.Bprops      // new props to set
    87  
    88  		wait         bool
    89  		needReMirror bool
    90  		needReEC     bool
    91  		terminate    bool
    92  		singleTarget bool
    93  	}
    94  )
    95  
    96  // interface guard
    97  var (
    98  	_ revs        = (*bucketMD)(nil)
    99  	_ meta.Bowner = (*bmdOwnerBase)(nil)
   100  	_ bmdOwner    = (*bmdOwnerPrx)(nil)
   101  	_ bmdOwner    = (*bmdOwnerTgt)(nil)
   102  )
   103  
   104  var bmdImmSize int64
   105  
   106  // c-tor
   107  func newBucketMD() *bucketMD {
   108  	providers := make(meta.Providers, 2)
   109  	namespaces := make(meta.Namespaces, 1)
   110  	providers[apc.AIS] = namespaces
   111  	buckets := make(meta.Buckets, 16)
   112  	debug.Assert(cmn.NsGlobalUname == cmn.NsGlobal.Uname())
   113  	namespaces[cmn.NsGlobalUname] = buckets
   114  
   115  	return &bucketMD{BMD: meta.BMD{Providers: providers, UUID: ""}}
   116  }
   117  
   118  func newClusterUUID() (uuid, created string) {
   119  	return cos.GenUUID(), time.Now().String()
   120  }
   121  
   122  //////////////
   123  // bucketMD //
   124  //////////////
   125  
   126  func (m *bucketMD) add(bck *meta.Bck, p *cmn.Bprops) bool {
   127  	debug.Assert(apc.IsProvider(bck.Provider))
   128  	if _, present := m.Get(bck); present {
   129  		return false
   130  	}
   131  
   132  	if m.Version == 0 {
   133  		m.Version = 1 // on-the-fly (e.g. via PUT remote) w/ brand-new cluster
   134  	}
   135  	p.SetProvider(bck.Provider)
   136  	p.BID = bck.MaskBID(m.Version)
   137  	p.Created = time.Now().UnixNano()
   138  	bck.Props = p
   139  
   140  	m.Add(bck)
   141  	m.Version++
   142  
   143  	return true
   144  }
   145  
   146  func (m *bucketMD) del(bck *meta.Bck) (deleted bool) {
   147  	if !m.Del(bck) {
   148  		return
   149  	}
   150  	m.Version++
   151  	return true
   152  }
   153  
   154  func (m *bucketMD) set(bck *meta.Bck, p *cmn.Bprops) {
   155  	debug.Assert(apc.IsProvider(bck.Provider))
   156  	prevProps, present := m.Get(bck)
   157  	if !present {
   158  		debug.Assertf(false, "%s: not present", bck)
   159  	}
   160  	debug.Assert(prevProps.BID != 0)
   161  
   162  	p.SetProvider(bck.Provider)
   163  	p.BID = prevProps.BID
   164  
   165  	// make sure bck.backend, if exists, references backend's own props in the BMD
   166  	if p.BackendBck.Name != "" && p.BackendBck.Props == nil {
   167  		if provider, err := cmn.NormalizeProvider(p.BackendBck.Provider); err == nil {
   168  			p.BackendBck.Provider = provider
   169  			p.BackendBck.Props, _ = m.Get((*meta.Bck)(&p.BackendBck))
   170  		}
   171  	}
   172  
   173  	m.Set(bck, p)
   174  
   175  	m.Version++
   176  }
   177  
   178  func (m *bucketMD) clone() *bucketMD {
   179  	dst := &bucketMD{}
   180  
   181  	// deep copy
   182  	*dst = *m
   183  	dst.Providers = make(meta.Providers, len(m.Providers))
   184  	for provider, namespaces := range m.Providers {
   185  		dstNamespaces := make(meta.Namespaces, len(namespaces))
   186  		for ns, buckets := range namespaces {
   187  			dstBuckets := make(meta.Buckets, len(buckets))
   188  			for name, p := range buckets {
   189  				dstProps := &cmn.Bprops{}
   190  				*dstProps = *p
   191  				dstBuckets[name] = dstProps
   192  			}
   193  			dstNamespaces[ns] = dstBuckets
   194  		}
   195  		dst.Providers[provider] = dstNamespaces
   196  	}
   197  
   198  	dst.vstr = m.vstr
   199  	dst._sgl = nil
   200  	return dst
   201  }
   202  
   203  func (m *bucketMD) validateUUID(nbmd *bucketMD, si, nsi *meta.Snode, caller string) (err error) {
   204  	if nbmd == nil || nbmd.Version == 0 || m.Version == 0 {
   205  		return
   206  	}
   207  	if !cos.IsValidUUID(m.UUID) || !cos.IsValidUUID(nbmd.UUID) {
   208  		return
   209  	}
   210  	if m.UUID == nbmd.UUID {
   211  		return
   212  	}
   213  	nsiname := caller
   214  	if nsi != nil {
   215  		nsiname = nsi.StringEx()
   216  	} else if nsiname == "" {
   217  		nsiname = "???"
   218  	}
   219  	hname := si.Name()
   220  	// FATAL: cluster integrity error (cie)
   221  	s := fmt.Sprintf("%s: BMDs have different UUIDs: (%s, %s) vs (%s, %s)",
   222  		ciError(40), hname, m.StringEx(), nsiname, nbmd.StringEx())
   223  	err = &errPrxBmdUUIDDiffer{s}
   224  	return
   225  }
   226  
   227  // as revs
   228  func (*bucketMD) tag() string       { return revsBMDTag }
   229  func (m *bucketMD) version() int64  { return m.Version }
   230  func (*bucketMD) jit(p *proxy) revs { return p.owner.bmd.get() }
   231  
   232  func (m *bucketMD) sgl() *memsys.SGL {
   233  	if m._sgl.IsNil() {
   234  		return nil
   235  	}
   236  	return m._sgl
   237  }
   238  
   239  func (m *bucketMD) marshal() []byte {
   240  	m._sgl = m._encode()
   241  	return m._sgl.Bytes()
   242  }
   243  
   244  func (m *bucketMD) _encode() (sgl *memsys.SGL) {
   245  	sgl = memsys.PageMM().NewSGL(bmdImmSize)
   246  	err := jsp.Encode(sgl, m, m.JspOpts())
   247  	debug.AssertNoErr(err)
   248  	bmdImmSize = max(bmdImmSize, sgl.Len())
   249  	return
   250  }
   251  
   252  //////////////////
   253  // bmdOwnerBase //
   254  //////////////////
   255  
   256  func (bo *bmdOwnerBase) Get() *meta.BMD       { return &bo.get().BMD }
   257  func (bo *bmdOwnerBase) get() (bmd *bucketMD) { return bo.bmd.Load() }
   258  
   259  func (bo *bmdOwnerBase) put(bmd *bucketMD) {
   260  	bmd.vstr = strconv.FormatInt(bmd.Version, 10)
   261  	bo.bmd.Store(bmd)
   262  }
   263  
   264  // write metasync-sent bytes directly (no json)
   265  func (*bmdOwnerBase) persistBytes(payload msPayload, fpath string) (done bool) {
   266  	if payload == nil {
   267  		return
   268  	}
   269  	bmdValue := payload[revsBMDTag]
   270  	if bmdValue == nil {
   271  		return
   272  	}
   273  	var (
   274  		bmd *meta.BMD
   275  		wto = cos.NewBuffer(bmdValue)
   276  		err = jsp.SaveMeta(fpath, bmd, wto)
   277  	)
   278  	done = err == nil
   279  	return
   280  }
   281  
   282  /////////////////
   283  // bmdOwnerPrx //
   284  /////////////////
   285  
   286  func newBMDOwnerPrx(config *cmn.Config) *bmdOwnerPrx {
   287  	return &bmdOwnerPrx{fpath: filepath.Join(config.ConfigDir, fname.Bmd)}
   288  }
   289  
   290  func (bo *bmdOwnerPrx) init() (prev bool) {
   291  	bmd, err := _loadBMD(bo.fpath)
   292  	if err != nil {
   293  		if !os.IsNotExist(err) {
   294  			nlog.Errorf("failed to load %s from %s, err: %v", bmd, bo.fpath, err)
   295  		} else {
   296  			nlog.Infof("%s does not exist at %s - initializing", bmd, bo.fpath)
   297  		}
   298  	}
   299  	bo.put(bmd)
   300  	return
   301  }
   302  
   303  func (bo *bmdOwnerPrx) putPersist(bmd *bucketMD, payload msPayload) (err error) {
   304  	if !bo.persistBytes(payload, bo.fpath) {
   305  		debug.Assert(bmd._sgl == nil)
   306  		bmd._sgl = bmd._encode()
   307  		err = jsp.SaveMeta(bo.fpath, bmd, bmd._sgl)
   308  		if err != nil {
   309  			bmd._sgl.Free()
   310  			bmd._sgl = nil
   311  		}
   312  	}
   313  	if err == nil {
   314  		bo.put(bmd)
   315  	}
   316  	return
   317  }
   318  
   319  func (*bmdOwnerPrx) persist(_ *bucketMD, _ msPayload) (err error) { debug.Assert(false); return }
   320  
   321  // under lock
   322  func (bo *bmdOwnerPrx) _pre(ctx *bmdModifier) (clone *bucketMD, err error) {
   323  	clone = bo.get().clone()
   324  	if err = ctx.pre(ctx, clone); err != nil || ctx.terminate {
   325  		return
   326  	}
   327  	err = bo.putPersist(clone, nil)
   328  	return
   329  }
   330  
   331  func (bo *bmdOwnerPrx) modify(ctx *bmdModifier) (clone *bucketMD, err error) {
   332  	bo.Lock()
   333  	clone, err = bo._pre(ctx)
   334  	bo.Unlock()
   335  	if err != nil || ctx.terminate {
   336  		if clone._sgl != nil {
   337  			clone._sgl.Free()
   338  			clone._sgl = nil
   339  		}
   340  		return
   341  	}
   342  	if ctx.final != nil {
   343  		ctx.final(ctx, clone)
   344  	} else if clone._sgl != nil {
   345  		clone._sgl.Free()
   346  		clone._sgl = nil
   347  	}
   348  	return
   349  }
   350  
   351  /////////////////
   352  // bmdOwnerTgt //
   353  /////////////////
   354  
   355  func newBMDOwnerTgt() *bmdOwnerTgt {
   356  	return &bmdOwnerTgt{}
   357  }
   358  
   359  func (bo *bmdOwnerTgt) init() (prev bool) {
   360  	var (
   361  		bmd       *bucketMD
   362  		available = fs.GetAvail()
   363  	)
   364  	if bmd = loadBMD(available, fname.Bmd); bmd != nil {
   365  		nlog.Infof("loaded %s", bmd)
   366  		goto finalize
   367  	}
   368  	if bmd = loadBMD(available, fname.BmdPrevious); bmd != nil {
   369  		nlog.Errorf("loaded previous version of the %s (%q)", bmd, fname.BmdPrevious)
   370  		prev = true
   371  		goto finalize
   372  	}
   373  	bmd = newBucketMD()
   374  	nlog.Warningf("initializing new %s", bmd)
   375  
   376  finalize:
   377  	bo.put(bmd)
   378  	return
   379  }
   380  
   381  func (bo *bmdOwnerTgt) putPersist(bmd *bucketMD, payload msPayload) (err error) {
   382  	if err = bo.persist(bmd, payload); err == nil {
   383  		bo.put(bmd)
   384  	}
   385  	return
   386  }
   387  
   388  func (*bmdOwnerTgt) persist(clone *bucketMD, payload msPayload) (err error) {
   389  	var (
   390  		b   []byte
   391  		sgl *memsys.SGL
   392  	)
   393  	if payload != nil {
   394  		if bmdValue := payload[revsBMDTag]; bmdValue != nil {
   395  			b = bmdValue
   396  		}
   397  	}
   398  	if b == nil {
   399  		sgl = clone._encode()
   400  		defer sgl.Free()
   401  	}
   402  	cnt, availCnt := fs.PersistOnMpaths(fname.Bmd, fname.BmdPrevious, clone, bmdCopies, b, sgl)
   403  	if cnt > 0 {
   404  		return
   405  	}
   406  	if availCnt == 0 {
   407  		nlog.Errorf("Cannot store %s: %v", clone, cmn.ErrNoMountpaths)
   408  		return
   409  	}
   410  	err = fmt.Errorf("failed to store %s on any of the mountpaths (%d)", clone, availCnt)
   411  	nlog.Errorln(err)
   412  	return
   413  }
   414  
   415  func (*bmdOwnerTgt) modify(_ *bmdModifier) (*bucketMD, error) {
   416  	debug.Assert(false)
   417  	return nil, nil
   418  }
   419  
   420  func loadBMD(mpaths fs.MPI, path string) (mainBMD *bucketMD) {
   421  	for _, mpath := range mpaths {
   422  		bmd := loadBMDFromMpath(mpath, path)
   423  		if bmd == nil {
   424  			continue
   425  		}
   426  		if mainBMD == nil {
   427  			mainBMD = bmd
   428  			continue
   429  		}
   430  		if mainBMD.cksum.Equal(bmd.cksum) {
   431  			continue
   432  		}
   433  		if mainBMD.Version == bmd.Version {
   434  			cos.ExitLogf("BMD is different (%q): %v vs %v", mpath, mainBMD, bmd)
   435  		}
   436  		nlog.Errorf("Warning: detected different BMD versions (%q): %v != %v", mpath, mainBMD, bmd)
   437  		if mainBMD.Version < bmd.Version {
   438  			mainBMD = bmd
   439  		}
   440  	}
   441  	return
   442  }
   443  
   444  func _loadBMD(path string) (bmd *bucketMD, err error) {
   445  	bmd = newBucketMD()
   446  	bmd.cksum, err = jsp.LoadMeta(path, bmd)
   447  	if _, ok := err.(*jsp.ErrUnsupportedMetaVersion); ok {
   448  		nlog.Errorf(cmn.FmtErrBackwardCompat, err)
   449  	}
   450  	return
   451  }
   452  
   453  func loadBMDFromMpath(mpath *fs.Mountpath, path string) (bmd *bucketMD) {
   454  	var (
   455  		fpath = filepath.Join(mpath.Path, path)
   456  		err   error
   457  	)
   458  	bmd, err = _loadBMD(fpath)
   459  	if err == nil {
   460  		return bmd
   461  	}
   462  	if !os.IsNotExist(err) {
   463  		// Should never be NotExist error as mpi should include only mpaths with relevant bmds stored.
   464  		nlog.Errorf("failed to load %s from %s, err: %v", bmd, fpath, err)
   465  	}
   466  	return nil
   467  }
   468  
   469  func hasEnoughBMDCopies() bool { return fs.CountPersisted(fname.Bmd) >= bmdCopies }
   470  
   471  //////////////////////////
   472  // default bucket props //
   473  //////////////////////////
   474  
   475  type bckPropsArgs struct {
   476  	bck *meta.Bck   // Base bucket for determining default bucket props.
   477  	hdr http.Header // Header with remote bucket properties.
   478  }
   479  
   480  // Convert HEAD(bucket) response to cmn.Bprops (compare with `defaultBckProps`)
   481  func remoteBckProps(args bckPropsArgs) (props *cmn.Bprops, err error) {
   482  	props = &cmn.Bprops{}
   483  	err = cmn.IterFields(props, func(tag string, field cmn.IterField) (error, bool) {
   484  		headerName := textproto.CanonicalMIMEHeaderKey(tag)
   485  		// skip the missing ones
   486  		if _, ok := args.hdr[headerName]; !ok {
   487  			return nil, false
   488  		}
   489  		// single-value
   490  		return field.SetValue(args.hdr.Get(headerName), true /*force*/), false
   491  	}, cmn.IterOpts{OnlyRead: false})
   492  	return
   493  }
   494  
   495  // Used to initialize "local" bucket, in particular when there's a remote one
   496  // (compare with `remoteBckProps` above)
   497  // See also:
   498  //   - github.com/NVIDIA/aistore/blob/main/docs/bucket.md#default-bucket-properties
   499  //   - cmn.BpropsToSet
   500  //   - cmn.Bck.DefaultProps
   501  func defaultBckProps(args bckPropsArgs) (props *cmn.Bprops) {
   502  	config := cmn.GCO.Get()
   503  	props = args.bck.Bucket().DefaultProps(&config.ClusterConfig)
   504  	props.SetProvider(args.bck.Provider)
   505  
   506  	switch {
   507  	case args.bck.IsAIS():
   508  		debug.Assert(args.hdr == nil)
   509  	case args.bck.Backend() != nil:
   510  		debug.Assertf(args.hdr == nil, "%s, hdr=%+v", args.bck, args.hdr)
   511  	case args.bck.IsRemote():
   512  		debug.Assert(args.hdr != nil)
   513  		props.Versioning.Enabled = false
   514  		props = mergeRemoteBckProps(props, args.hdr)
   515  	default:
   516  		debug.Assert(false)
   517  	}
   518  	err := props.Validate(9999 /*targetCnt*/)
   519  	debug.AssertNoErr(err)
   520  	return
   521  }
   522  
   523  func mergeRemoteBckProps(props *cmn.Bprops, header http.Header) *cmn.Bprops {
   524  	debug.Assert(len(header) > 0)
   525  	switch props.Provider {
   526  	case apc.AWS:
   527  		props.Extra.AWS.CloudRegion = header.Get(apc.HdrS3Region)
   528  		props.Extra.AWS.Endpoint = header.Get(apc.HdrS3Endpoint)
   529  		props.Extra.AWS.Profile = header.Get(apc.HdrS3Profile)
   530  	case apc.HTTP:
   531  		props.Extra.HTTP.OrigURLBck = header.Get(apc.HdrOrigURLBck)
   532  	}
   533  
   534  	if verStr := header.Get(apc.HdrBucketVerEnabled); verStr != "" {
   535  		versioning, err := cos.ParseBool(verStr)
   536  		debug.AssertNoErr(err)
   537  		props.Versioning.Enabled = versioning
   538  	}
   539  	return props
   540  }
   541  
   542  // returns (uname, nlc) pair to lock/unlock buckets
   543  func newBckNLP(b *meta.Bck) core.NLP { return core.NewNLP(b.MakeUname("")) }