github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/objstorage/objstorageprovider/remoteobjcat/catalog.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package remoteobjcat

import (
	"cmp"
	"fmt"
	"io"
	"slices"
	"sync"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

// Catalog is used to manage the on-disk remote object catalog.
//
// The catalog file is a log of records, where each record is an encoded
// VersionEdit.
type Catalog struct {
	fs      vfs.FS
	dirname string
	mu      struct {
		sync.Mutex

		creatorID objstorage.CreatorID
		objects   map[base.DiskFileNum]RemoteObjectMetadata

		marker *atomicfs.Marker

		catalogFile      vfs.File
		catalogRecWriter *record.Writer

		rotationHelper record.RotationHelper

		// catalogFilename is the filename of catalogFile when catalogFile != nil;
		// otherwise, it is the filename of the last catalog file.
		catalogFilename string
	}
}

// RemoteObjectMetadata encapsulates the data stored in the catalog file for each object.
type RemoteObjectMetadata struct {
	// FileNum is the identifier for the object within the context of a single DB
	// instance.
	FileNum base.DiskFileNum
	// FileType is the type of the object. Only certain FileTypes are possible.
	FileType base.FileType
	// CreatorID identifies the DB instance that originally created the object.
	CreatorID objstorage.CreatorID
	// CreatorFileNum is the identifier for the object within the context of the
	// DB instance that originally created the object.
	CreatorFileNum base.DiskFileNum
	// CleanupMethod indicates the method for cleaning up unused shared objects.
	CleanupMethod objstorage.SharedCleanupMethod
	// Locator identifies a remote.Storage implementation.
	Locator remote.Locator
	// CustomObjectName (if it is set) overrides the object name that is normally
	// derived from the CreatorID and CreatorFileNum.
	CustomObjectName string
}
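
// The sketch below is illustrative and not part of the original file: it shows
// how a RemoteObjectMetadata value might be populated for a shared sstable.
// The concrete file numbers, creator ID, and the "shared-bucket" locator are
// made-up example values; objstorage.SharedRefTracking and base.FileTypeTable
// are assumed to be the usual constants from the imported packages.
func exampleRemoteObjectMetadata() RemoteObjectMetadata {
	return RemoteObjectMetadata{
		// Identifier of the object within this DB instance.
		FileNum: base.DiskFileNum(17),
		// Shared sstables are the typical kind of remote object.
		FileType: base.FileTypeTable,
		// The DB instance that originally created the object, and the object's
		// identifier within that instance.
		CreatorID:      objstorage.CreatorID(5),
		CreatorFileNum: base.DiskFileNum(17),
		// Clean up the object via reference tracking once it is unused.
		CleanupMethod: objstorage.SharedRefTracking,
		// Which remote.Storage implementation hosts the object.
		Locator: remote.Locator("shared-bucket"),
	}
}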

const (
	catalogFilenameBase = "REMOTE-OBJ-CATALOG"
	catalogMarkerName   = "remote-obj-catalog"

	// We create a new file when the size exceeds 1MB (and some other conditions
	// hold; see record.RotationHelper).
	rotateFileSize = 1024 * 1024 // 1MB
)

// CatalogContents contains the remote objects in the catalog.
type CatalogContents struct {
	// CreatorID, if it is set.
	CreatorID objstorage.CreatorID
	Objects   []RemoteObjectMetadata
}

// Open creates a Catalog and loads any existing catalog file, returning the
// creator ID (if it is set) and the contents.
func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) {
	c := &Catalog{
		fs:      fs,
		dirname: dirname,
	}
	c.mu.objects = make(map[base.DiskFileNum]RemoteObjectMetadata)

	var err error
	c.mu.marker, c.mu.catalogFilename, err = atomicfs.LocateMarker(fs, dirname, catalogMarkerName)
	if err != nil {
		return nil, CatalogContents{}, err
	}
	// If the filename is empty, there is no existing catalog.
	if c.mu.catalogFilename != "" {
		if err := c.loadFromCatalogFile(c.mu.catalogFilename); err != nil {
			return nil, CatalogContents{}, err
		}
		if err := c.mu.marker.RemoveObsolete(); err != nil {
			return nil, CatalogContents{}, err
		}
		// TODO(radu): remove obsolete catalog files.
	}
	res := CatalogContents{
		CreatorID: c.mu.creatorID,
		Objects:   make([]RemoteObjectMetadata, 0, len(c.mu.objects)),
	}
	for _, meta := range c.mu.objects {
		res.Objects = append(res.Objects, meta)
	}
	// Sort the objects so the function is deterministic.
	slices.SortFunc(res.Objects, func(a, b RemoteObjectMetadata) int {
		return cmp.Compare(a.FileNum, b.FileNum)
	})
	return c, res, nil
}
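
// Illustrative sketch, not part of the original file: opening the catalog in a
// DB directory and listing its contents. The "/path/to/db" directory and the
// use of vfs.Default are assumptions made for the example.
func exampleOpenCatalog() error {
	cat, contents, err := Open(vfs.Default, "/path/to/db")
	if err != nil {
		return err
	}
	defer cat.Close()

	if contents.CreatorID.IsSet() {
		fmt.Printf("creator ID: %s\n", contents.CreatorID)
	}
	for _, meta := range contents.Objects {
		fmt.Printf("object %s on locator %q\n", meta.FileNum, meta.Locator)
	}
	return nil
}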

// SetCreatorID sets the creator ID. If it is already set, it must match.
func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error {
	if !id.IsSet() {
		return errors.AssertionFailedf("attempt to unset CreatorID")
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	if c.mu.creatorID.IsSet() {
		if c.mu.creatorID != id {
			return errors.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id)
		}
		return nil
	}

	ve := VersionEdit{CreatorID: id}
	if err := c.writeToCatalogFileLocked(&ve); err != nil {
		return errors.Wrapf(err, "pebble: could not write to remote object catalog")
	}
	c.mu.creatorID = id
	return nil
}

// Close closes any open files.
func (c *Catalog) Close() error {
	return c.closeCatalogFile()
}

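// closeCatalogFile closes the current catalog record writer and file (if any),
// returning the first error encountered.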
func (c *Catalog) closeCatalogFile() error {
	if c.mu.catalogFile == nil {
		return nil
	}
	err1 := c.mu.catalogRecWriter.Close()
	err2 := c.mu.catalogFile.Close()
	c.mu.catalogRecWriter = nil
	c.mu.catalogFile = nil
	if err1 != nil {
		return err1
	}
	return err2
}

// Batch is used to perform multiple object additions/deletions at once.
type Batch struct {
	ve VersionEdit
}

// AddObject adds a new object to the batch.
//
// The given FileNum must be new - it must not match that of any object that was
// ever in the catalog.
func (b *Batch) AddObject(meta RemoteObjectMetadata) {
	b.ve.NewObjects = append(b.ve.NewObjects, meta)
}

// DeleteObject adds an object removal to the batch.
func (b *Batch) DeleteObject(fileNum base.DiskFileNum) {
	b.ve.DeletedObjects = append(b.ve.DeletedObjects, fileNum)
}

// Reset clears the batch.
func (b *Batch) Reset() {
	b.ve.NewObjects = b.ve.NewObjects[:0]
	b.ve.DeletedObjects = b.ve.DeletedObjects[:0]
}

// IsEmpty returns true if the batch is empty.
func (b *Batch) IsEmpty() bool {
	return len(b.ve.NewObjects) == 0 && len(b.ve.DeletedObjects) == 0
}

// Copy returns a copy of the Batch.
func (b *Batch) Copy() Batch {
	var res Batch
	if len(b.ve.NewObjects) > 0 {
		res.ve.NewObjects = make([]RemoteObjectMetadata, len(b.ve.NewObjects))
		copy(res.ve.NewObjects, b.ve.NewObjects)
	}
	if len(b.ve.DeletedObjects) > 0 {
		res.ve.DeletedObjects = make([]base.DiskFileNum, len(b.ve.DeletedObjects))
		copy(res.ve.DeletedObjects, b.ve.DeletedObjects)
	}
	return res
}

// Append merges two batches.
func (b *Batch) Append(other Batch) {
	b.ve.NewObjects = append(b.ve.NewObjects, other.ve.NewObjects...)
	b.ve.DeletedObjects = append(b.ve.DeletedObjects, other.ve.DeletedObjects...)
}
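
// Illustrative sketch, not part of the original file: assembling a Batch that
// adds one object and deletes two others. The metadata comes from the
// hypothetical exampleRemoteObjectMetadata helper above, and the deleted file
// numbers are made-up example values; deletions must refer to objects that are
// in the catalog at the time the batch is applied.
func exampleBuildBatch() Batch {
	var b Batch
	b.AddObject(exampleRemoteObjectMetadata())
	b.DeleteObject(base.DiskFileNum(12))

	// Batches can be copied and merged before being applied.
	merged := b.Copy()
	var other Batch
	other.DeleteObject(base.DiskFileNum(13))
	merged.Append(other)
	return merged
}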

// ApplyBatch applies a batch of updates; it returns after the change is stably
// recorded on storage.
func (c *Catalog) ApplyBatch(b Batch) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Sanity checks.
	toAdd := make(map[base.DiskFileNum]struct{}, len(b.ve.NewObjects))
	exists := func(n base.DiskFileNum) bool {
		_, ok := c.mu.objects[n]
		if !ok {
			_, ok = toAdd[n]
		}
		return ok
	}
	for _, meta := range b.ve.NewObjects {
		if exists(meta.FileNum) {
			return errors.AssertionFailedf("adding existing object %s", meta.FileNum)
		}
		toAdd[meta.FileNum] = struct{}{}
	}
	for _, n := range b.ve.DeletedObjects {
		if !exists(n) {
			return errors.AssertionFailedf("deleting non-existent object %s", n)
		}
	}

	if err := c.writeToCatalogFileLocked(&b.ve); err != nil {
		return errors.Wrapf(err, "pebble: could not write to remote object catalog")
	}

	// Add new objects before deleting any objects. This allows for cases where
	// the same batch adds and deletes an object.
	for _, meta := range b.ve.NewObjects {
		c.mu.objects[meta.FileNum] = meta
	}
	for _, n := range b.ve.DeletedObjects {
		delete(c.mu.objects, n)
	}

	return nil
}
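
// Illustrative sketch, not part of the original file: a minimal end-to-end
// flow that opens a catalog on an in-memory filesystem, sets the creator ID,
// and durably applies a batch. The "db" directory, the creator ID value, and
// the use of vfs.NewMem are assumptions made for the example;
// exampleRemoteObjectMetadata is the hypothetical helper defined above.
func exampleApplyBatch() error {
	mem := vfs.NewMem()
	if err := mem.MkdirAll("db", 0755); err != nil {
		return err
	}
	cat, _, err := Open(mem, "db")
	if err != nil {
		return err
	}
	defer cat.Close()

	if err := cat.SetCreatorID(objstorage.CreatorID(1)); err != nil {
		return err
	}

	var b Batch
	b.AddObject(exampleRemoteObjectMetadata())
	// ApplyBatch returns only after the edit is synced to the catalog file.
	return cat.ApplyBatch(b)
}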

// loadFromCatalogFile reads the given catalog file and applies each decoded
// VersionEdit to the in-memory state (creator ID and object map).
func (c *Catalog) loadFromCatalogFile(filename string) error {
	catalogPath := c.fs.PathJoin(c.dirname, filename)
	f, err := c.fs.Open(catalogPath)
	if err != nil {
		return errors.Wrapf(
			err, "pebble: could not open remote object catalog file %q for DB %q",
			errors.Safe(filename), c.dirname,
		)
	}
	defer f.Close()
	rr := record.NewReader(f, 0 /* logNum */)
	for {
		r, err := rr.Next()
		if err == io.EOF || record.IsInvalidRecord(err) {
			break
		}
		if err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
		var ve VersionEdit
		if err := ve.Decode(r); err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
		// Apply the version edit to the current state.
		if err := ve.Apply(&c.mu.creatorID, c.mu.objects); err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
	}
	return nil
}

// writeToCatalogFileLocked writes a VersionEdit to the catalog file, creating
// a new catalog file if this is the first write and rotating to a new file
// once the current one has grown large enough (see rotateFileSize and
// record.RotationHelper).
func (c *Catalog) writeToCatalogFileLocked(ve *VersionEdit) error {
	c.mu.rotationHelper.AddRecord(int64(len(ve.NewObjects) + len(ve.DeletedObjects)))
	snapshotSize := int64(len(c.mu.objects))

	var shouldRotate bool
	if c.mu.catalogFile == nil {
		shouldRotate = true
	} else if c.mu.catalogRecWriter.Size() >= rotateFileSize {
		shouldRotate = c.mu.rotationHelper.ShouldRotate(snapshotSize)
	}

	if shouldRotate {
		if c.mu.catalogFile != nil {
			if err := c.closeCatalogFile(); err != nil {
				return err
			}
		}
		if err := c.createNewCatalogFileLocked(); err != nil {
			return err
		}
		c.mu.rotationHelper.Rotate(snapshotSize)
	}
	return writeRecord(ve, c.mu.catalogFile, c.mu.catalogRecWriter)
}

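// makeCatalogFilename returns the catalog filename for the given marker
// iteration; for example, iteration 3 yields "REMOTE-OBJ-CATALOG-000003".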
func makeCatalogFilename(iter uint64) string {
	return fmt.Sprintf("%s-%06d", catalogFilenameBase, iter)
}

// createNewCatalogFileLocked creates a new catalog file, populates it with the
// current catalog and sets c.mu.catalogFile and c.mu.catalogRecWriter.
func (c *Catalog) createNewCatalogFileLocked() (outErr error) {
	if c.mu.catalogFile != nil {
		return errors.AssertionFailedf("catalogFile already open")
	}
	filename := makeCatalogFilename(c.mu.marker.NextIter())
	filepath := c.fs.PathJoin(c.dirname, filename)
	file, err := c.fs.Create(filepath)
	if err != nil {
		return err
	}
	recWriter := record.NewWriter(file)
	err = func() error {
		// Create a VersionEdit that gets us from an empty catalog to the current state.
		var ve VersionEdit
		ve.CreatorID = c.mu.creatorID
		ve.NewObjects = make([]RemoteObjectMetadata, 0, len(c.mu.objects))
		for _, meta := range c.mu.objects {
			ve.NewObjects = append(ve.NewObjects, meta)
		}
		if err := writeRecord(&ve, file, recWriter); err != nil {
			return err
		}

		// Move the marker to the new filename. Move handles syncing the data
		// directory as well.
		if err := c.mu.marker.Move(filename); err != nil {
			return errors.Wrap(err, "moving marker")
		}

		return nil
	}()

	if err != nil {
		_ = recWriter.Close()
		_ = file.Close()
		_ = c.fs.Remove(filepath)
		return err
	}

	// Remove any previous file (ignoring any error).
	if c.mu.catalogFilename != "" {
		_ = c.fs.Remove(c.fs.PathJoin(c.dirname, c.mu.catalogFilename))
	}

	c.mu.catalogFile = file
	c.mu.catalogRecWriter = recWriter
	c.mu.catalogFilename = filename
	return nil
}

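// writeRecord encodes a single VersionEdit as a record, flushes the record
// writer, and syncs the underlying file so the edit is durably stored.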
func writeRecord(ve *VersionEdit, file vfs.File, recWriter *record.Writer) error {
	w, err := recWriter.Next()
	if err != nil {
		return err
	}
	if err := ve.Encode(w); err != nil {
		return err
	}
	if err := recWriter.Flush(); err != nil {
		return err
	}
	return file.Sync()
}