github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/remoteobjcat/catalog.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package remoteobjcat
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"sort"
    11  	"sync"
    12  
    13  	"github.com/cockroachdb/errors"
    14  	"github.com/cockroachdb/pebble/internal/base"
    15  	"github.com/cockroachdb/pebble/objstorage"
    16  	"github.com/cockroachdb/pebble/objstorage/remote"
    17  	"github.com/cockroachdb/pebble/record"
    18  	"github.com/cockroachdb/pebble/vfs"
    19  	"github.com/cockroachdb/pebble/vfs/atomicfs"
    20  )
    21  
// Catalog is used to manage the on-disk remote object catalog.
//
// The catalog file is a log of records, where each record is an encoded
// VersionEdit. Each newly created catalog file starts with a single record
// that describes the whole current catalog; later records are the individual
// edits applied after that.
type Catalog struct {
	// fs and dirname identify the filesystem and the directory that contain
	// the catalog files and the marker.
	fs      vfs.FS
	dirname string
	// mu groups the mutable state together with the mutex. SetCreatorID and
	// ApplyBatch hold the mutex while they read or modify these fields.
	mu      struct {
		sync.Mutex

		// creatorID is the creator ID recorded in the catalog (it may be unset).
		creatorID objstorage.CreatorID
		// objects is the in-memory view of the catalog, keyed by FileNum.
		objects   map[base.DiskFileNum]RemoteObjectMetadata

		// marker is the marker (named catalogMarkerName) that is moved to the
		// filename of each newly created catalog file.
		marker *atomicfs.Marker

		// catalogFile is the catalog file currently open for writing, and
		// catalogRecWriter is the record writer layered on top of it. Both are
		// nil when no file is open.
		catalogFile      vfs.File
		catalogRecWriter *record.Writer

		// rotationHelper is consulted to decide whether to start a new catalog
		// file once the current one reaches rotateFileSize.
		rotationHelper record.RotationHelper

		// catalogFilename is the filename of catalogFile when catalogFile != nil, otherwise
		// it is the filename of the last catalog file.
		catalogFilename string
	}
}
    47  
// RemoteObjectMetadata encapsulates the data stored in the catalog file for each object.
type RemoteObjectMetadata struct {
	// FileNum is the identifier for the object within the context of a single DB
	// instance. It is the key under which the Catalog tracks the object.
	FileNum base.DiskFileNum
	// FileType is the type of the object. Only certain FileTypes are possible.
	FileType base.FileType
	// CreatorID identifies the DB instance that originally created the object.
	CreatorID objstorage.CreatorID
	// CreatorFileNum is the identifier for the object within the context of the
	// DB instance that originally created the object.
	CreatorFileNum base.DiskFileNum
	// CleanupMethod indicates the method for cleaning up unused shared objects.
	CleanupMethod objstorage.SharedCleanupMethod
	// Locator identifies a remote.Storage implementation.
	Locator remote.Locator
	// CustomObjectName (if it is set) overrides the object name that is normally
	// derived from the CreatorID and CreatorFileNum.
	CustomObjectName string
}
    68  
const (
	// catalogFilenameBase is the prefix of every catalog filename; see
	// makeCatalogFilename.
	catalogFilenameBase = "REMOTE-OBJ-CATALOG"
	// catalogMarkerName is the name of the marker that tracks the current
	// catalog file.
	catalogMarkerName   = "remote-obj-catalog"

	// We create a new file when the size exceeds 1MB (and some other conditions
	// hold; see record.RotationHelper).
	rotateFileSize = 1024 * 1024 // 1MB
)
    77  
// CatalogContents contains the remote objects in the catalog, as returned by
// Open.
type CatalogContents struct {
	// CreatorID, if it is set.
	CreatorID objstorage.CreatorID
	// Objects lists the metadata of every object in the catalog. Open returns
	// it sorted by FileNum.
	Objects   []RemoteObjectMetadata
}
    84  
    85  // Open creates a Catalog and loads any existing catalog file, returning the
    86  // creator ID (if it is set) and the contents.
    87  func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) {
    88  	c := &Catalog{
    89  		fs:      fs,
    90  		dirname: dirname,
    91  	}
    92  	c.mu.objects = make(map[base.DiskFileNum]RemoteObjectMetadata)
    93  
    94  	var err error
    95  	c.mu.marker, c.mu.catalogFilename, err = atomicfs.LocateMarker(fs, dirname, catalogMarkerName)
    96  	if err != nil {
    97  		return nil, CatalogContents{}, err
    98  	}
    99  	// If the filename is empty, there is no existing catalog.
   100  	if c.mu.catalogFilename != "" {
   101  		if err := c.loadFromCatalogFile(c.mu.catalogFilename); err != nil {
   102  			return nil, CatalogContents{}, err
   103  		}
   104  		if err := c.mu.marker.RemoveObsolete(); err != nil {
   105  			return nil, CatalogContents{}, err
   106  		}
   107  		// TODO(radu): remove obsolete catalog files.
   108  	}
   109  	res := CatalogContents{
   110  		CreatorID: c.mu.creatorID,
   111  		Objects:   make([]RemoteObjectMetadata, 0, len(c.mu.objects)),
   112  	}
   113  	for _, meta := range c.mu.objects {
   114  		res.Objects = append(res.Objects, meta)
   115  	}
   116  	// Sort the objects so the function is deterministic.
   117  	sort.Slice(res.Objects, func(i, j int) bool {
   118  		return res.Objects[i].FileNum.FileNum() < res.Objects[j].FileNum.FileNum()
   119  	})
   120  	return c, res, nil
   121  }
   122  
   123  // SetCreatorID sets the creator ID. If it is already set, it must match.
   124  func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error {
   125  	if !id.IsSet() {
   126  		return errors.AssertionFailedf("attempt to unset CreatorID")
   127  	}
   128  
   129  	c.mu.Lock()
   130  	defer c.mu.Unlock()
   131  
   132  	if c.mu.creatorID.IsSet() {
   133  		if c.mu.creatorID != id {
   134  			return errors.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id)
   135  		}
   136  		return nil
   137  	}
   138  
   139  	ve := VersionEdit{CreatorID: id}
   140  	if err := c.writeToCatalogFileLocked(&ve); err != nil {
   141  		return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err)
   142  	}
   143  	c.mu.creatorID = id
   144  	return nil
   145  }
   146  
   147  // Close any open files.
   148  func (c *Catalog) Close() error {
   149  	return c.closeCatalogFile()
   150  }
   151  
   152  func (c *Catalog) closeCatalogFile() error {
   153  	if c.mu.catalogFile == nil {
   154  		return nil
   155  	}
   156  	err1 := c.mu.catalogRecWriter.Close()
   157  	err2 := c.mu.catalogFile.Close()
   158  	c.mu.catalogRecWriter = nil
   159  	c.mu.catalogFile = nil
   160  	if err1 != nil {
   161  		return err1
   162  	}
   163  	return err2
   164  }
   165  
// Batch is used to perform multiple object additions/deletions at once.
//
// The zero value is an empty batch. A batch takes effect only when it is
// passed to Catalog.ApplyBatch.
type Batch struct {
	// ve accumulates the additions (NewObjects) and removals (DeletedObjects).
	ve VersionEdit
}
   170  
   171  // AddObject adds a new object to the batch.
   172  //
   173  // The given FileNum must be new - it must not match that of any object that was
   174  // ever in the catalog.
   175  func (b *Batch) AddObject(meta RemoteObjectMetadata) {
   176  	b.ve.NewObjects = append(b.ve.NewObjects, meta)
   177  }
   178  
   179  // DeleteObject adds an object removal to the batch.
   180  func (b *Batch) DeleteObject(fileNum base.DiskFileNum) {
   181  	b.ve.DeletedObjects = append(b.ve.DeletedObjects, fileNum)
   182  }
   183  
   184  // Reset clears the batch.
   185  func (b *Batch) Reset() {
   186  	b.ve.NewObjects = b.ve.NewObjects[:0]
   187  	b.ve.DeletedObjects = b.ve.DeletedObjects[:0]
   188  }
   189  
   190  // IsEmpty returns true if the batch is empty.
   191  func (b *Batch) IsEmpty() bool {
   192  	return len(b.ve.NewObjects) == 0 && len(b.ve.DeletedObjects) == 0
   193  }
   194  
   195  // Copy returns a copy of the Batch.
   196  func (b *Batch) Copy() Batch {
   197  	var res Batch
   198  	if len(b.ve.NewObjects) > 0 {
   199  		res.ve.NewObjects = make([]RemoteObjectMetadata, len(b.ve.NewObjects))
   200  		copy(res.ve.NewObjects, b.ve.NewObjects)
   201  	}
   202  	if len(b.ve.DeletedObjects) > 0 {
   203  		res.ve.DeletedObjects = make([]base.DiskFileNum, len(b.ve.DeletedObjects))
   204  		copy(res.ve.DeletedObjects, b.ve.DeletedObjects)
   205  	}
   206  	return res
   207  }
   208  
   209  // Append merges two batches.
   210  func (b *Batch) Append(other Batch) {
   211  	b.ve.NewObjects = append(b.ve.NewObjects, other.ve.NewObjects...)
   212  	b.ve.DeletedObjects = append(b.ve.DeletedObjects, other.ve.DeletedObjects...)
   213  }
   214  
   215  // ApplyBatch applies a batch of updates; returns after the change is stably
   216  // recorded on storage.
   217  func (c *Catalog) ApplyBatch(b Batch) error {
   218  	c.mu.Lock()
   219  	defer c.mu.Unlock()
   220  
   221  	// Sanity checks.
   222  	toAdd := make(map[base.DiskFileNum]struct{}, len(b.ve.NewObjects))
   223  	exists := func(n base.DiskFileNum) bool {
   224  		_, ok := c.mu.objects[n]
   225  		if !ok {
   226  			_, ok = toAdd[n]
   227  		}
   228  		return ok
   229  	}
   230  	for _, meta := range b.ve.NewObjects {
   231  		if exists(meta.FileNum) {
   232  			return errors.AssertionFailedf("adding existing object %s", meta.FileNum)
   233  		}
   234  		toAdd[meta.FileNum] = struct{}{}
   235  	}
   236  	for _, n := range b.ve.DeletedObjects {
   237  		if !exists(n) {
   238  			return errors.AssertionFailedf("deleting non-existent object %s", n)
   239  		}
   240  	}
   241  
   242  	if err := c.writeToCatalogFileLocked(&b.ve); err != nil {
   243  		return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err)
   244  	}
   245  
   246  	// Add new objects before deleting any objects. This allows for cases where
   247  	// the same batch adds and deletes an object.
   248  	for _, meta := range b.ve.NewObjects {
   249  		c.mu.objects[meta.FileNum] = meta
   250  	}
   251  	for _, n := range b.ve.DeletedObjects {
   252  		delete(c.mu.objects, n)
   253  	}
   254  
   255  	return nil
   256  }
   257  
   258  func (c *Catalog) loadFromCatalogFile(filename string) error {
   259  	catalogPath := c.fs.PathJoin(c.dirname, filename)
   260  	f, err := c.fs.Open(catalogPath)
   261  	if err != nil {
   262  		return errors.Wrapf(
   263  			err, "pebble: could not open remote object catalog file %q for DB %q",
   264  			errors.Safe(filename), c.dirname,
   265  		)
   266  	}
   267  	defer f.Close()
   268  	rr := record.NewReader(f, 0 /* logNum */)
   269  	for {
   270  		r, err := rr.Next()
   271  		if err == io.EOF || record.IsInvalidRecord(err) {
   272  			break
   273  		}
   274  		if err != nil {
   275  			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
   276  				errors.Safe(filename))
   277  		}
   278  		var ve VersionEdit
   279  		if err := ve.Decode(r); err != nil {
   280  			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
   281  				errors.Safe(filename))
   282  		}
   283  		// Apply the version edit to the current state.
   284  		if err := ve.Apply(&c.mu.creatorID, c.mu.objects); err != nil {
   285  			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
   286  				errors.Safe(filename))
   287  		}
   288  	}
   289  	return nil
   290  }
   291  
   292  // writeToCatalogFileLocked writes a VersionEdit to the catalog file.
   293  // Creates a new file if this is the first write.
   294  func (c *Catalog) writeToCatalogFileLocked(ve *VersionEdit) error {
   295  	c.mu.rotationHelper.AddRecord(int64(len(ve.NewObjects) + len(ve.DeletedObjects)))
   296  	snapshotSize := int64(len(c.mu.objects))
   297  
   298  	var shouldRotate bool
   299  	if c.mu.catalogFile == nil {
   300  		shouldRotate = true
   301  	} else if c.mu.catalogRecWriter.Size() >= rotateFileSize {
   302  		shouldRotate = c.mu.rotationHelper.ShouldRotate(snapshotSize)
   303  	}
   304  
   305  	if shouldRotate {
   306  		if c.mu.catalogFile != nil {
   307  			if err := c.closeCatalogFile(); err != nil {
   308  				return err
   309  			}
   310  		}
   311  		if err := c.createNewCatalogFileLocked(); err != nil {
   312  			return err
   313  		}
   314  		c.mu.rotationHelper.Rotate(snapshotSize)
   315  	}
   316  	return writeRecord(ve, c.mu.catalogFile, c.mu.catalogRecWriter)
   317  }
   318  
   319  func makeCatalogFilename(iter uint64) string {
   320  	return fmt.Sprintf("%s-%06d", catalogFilenameBase, iter)
   321  }
   322  
   323  // createNewCatalogFileLocked creates a new catalog file, populates it with the
   324  // current catalog and sets c.mu.catalogFile and c.mu.catalogRecWriter.
   325  func (c *Catalog) createNewCatalogFileLocked() (outErr error) {
   326  	if c.mu.catalogFile != nil {
   327  		return errors.AssertionFailedf("catalogFile already open")
   328  	}
   329  	filename := makeCatalogFilename(c.mu.marker.NextIter())
   330  	filepath := c.fs.PathJoin(c.dirname, filename)
   331  	file, err := c.fs.Create(filepath)
   332  	if err != nil {
   333  		return err
   334  	}
   335  	recWriter := record.NewWriter(file)
   336  	err = func() error {
   337  		// Create a VersionEdit that gets us from an empty catalog to the current state.
   338  		var ve VersionEdit
   339  		ve.CreatorID = c.mu.creatorID
   340  		ve.NewObjects = make([]RemoteObjectMetadata, 0, len(c.mu.objects))
   341  		for _, meta := range c.mu.objects {
   342  			ve.NewObjects = append(ve.NewObjects, meta)
   343  		}
   344  		if err := writeRecord(&ve, file, recWriter); err != nil {
   345  			return err
   346  		}
   347  
   348  		// Move the marker to the new filename. Move handles syncing the data
   349  		// directory as well.
   350  		if err := c.mu.marker.Move(filename); err != nil {
   351  			return errors.Wrap(err, "moving marker")
   352  		}
   353  
   354  		return nil
   355  	}()
   356  
   357  	if err != nil {
   358  		_ = recWriter.Close()
   359  		_ = file.Close()
   360  		_ = c.fs.Remove(filepath)
   361  		return err
   362  	}
   363  
   364  	// Remove any previous file (ignoring any error).
   365  	if c.mu.catalogFilename != "" {
   366  		_ = c.fs.Remove(c.fs.PathJoin(c.dirname, c.mu.catalogFilename))
   367  	}
   368  
   369  	c.mu.catalogFile = file
   370  	c.mu.catalogRecWriter = recWriter
   371  	c.mu.catalogFilename = filename
   372  	return nil
   373  }
   374  
   375  func writeRecord(ve *VersionEdit, file vfs.File, recWriter *record.Writer) error {
   376  	w, err := recWriter.Next()
   377  	if err != nil {
   378  		return err
   379  	}
   380  	if err := ve.Encode(w); err != nil {
   381  		return err
   382  	}
   383  	if err := recWriter.Flush(); err != nil {
   384  		return err
   385  	}
   386  	return file.Sync()
   387  }