github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/objstorage/objstorageprovider/remoteobjcat/catalog.go (about) 1 // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package remoteobjcat 6 7 import ( 8 "cmp" 9 "fmt" 10 "io" 11 "slices" 12 "sync" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/base" 16 "github.com/cockroachdb/pebble/objstorage" 17 "github.com/cockroachdb/pebble/objstorage/remote" 18 "github.com/cockroachdb/pebble/record" 19 "github.com/cockroachdb/pebble/vfs" 20 "github.com/cockroachdb/pebble/vfs/atomicfs" 21 ) 22 23 // Catalog is used to manage the on-disk remote object catalog. 24 // 25 // The catalog file is a log of records, where each record is an encoded 26 // VersionEdit. 27 type Catalog struct { 28 fs vfs.FS 29 dirname string 30 mu struct { 31 sync.Mutex 32 33 creatorID objstorage.CreatorID 34 objects map[base.DiskFileNum]RemoteObjectMetadata 35 36 marker *atomicfs.Marker 37 38 catalogFile vfs.File 39 catalogRecWriter *record.Writer 40 41 rotationHelper record.RotationHelper 42 43 // catalogFilename is the filename of catalogFile when catalogFile != nil, otherwise 44 // it is the filename of the last catalog file. 45 catalogFilename string 46 } 47 } 48 49 // RemoteObjectMetadata encapsulates the data stored in the catalog file for each object. 50 type RemoteObjectMetadata struct { 51 // FileNum is the identifier for the object within the context of a single DB 52 // instance. 53 FileNum base.DiskFileNum 54 // FileType is the type of the object. Only certain FileTypes are possible. 55 FileType base.FileType 56 // CreatorID identifies the DB instance that originally created the object. 57 CreatorID objstorage.CreatorID 58 // CreatorFileNum is the identifier for the object within the context of the 59 // DB instance that originally created the object. 60 CreatorFileNum base.DiskFileNum 61 // CleanupMethod indicates the method for cleaning up unused shared objects. 62 CleanupMethod objstorage.SharedCleanupMethod 63 // Locator identifies a remote.Storage implementation. 64 Locator remote.Locator 65 // CustomObjectName (if it is set) overrides the object name that is normally 66 // derived from the CreatorID and CreatorFileNum. 67 CustomObjectName string 68 } 69 70 const ( 71 catalogFilenameBase = "REMOTE-OBJ-CATALOG" 72 catalogMarkerName = "remote-obj-catalog" 73 74 // We create a new file when the size exceeds 1MB (and some other conditions 75 // hold; see record.RotationHelper). 76 rotateFileSize = 1024 * 1024 // 1MB 77 ) 78 79 // CatalogContents contains the remote objects in the catalog. 80 type CatalogContents struct { 81 // CreatorID, if it is set. 82 CreatorID objstorage.CreatorID 83 Objects []RemoteObjectMetadata 84 } 85 86 // Open creates a Catalog and loads any existing catalog file, returning the 87 // creator ID (if it is set) and the contents. 88 func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) { 89 c := &Catalog{ 90 fs: fs, 91 dirname: dirname, 92 } 93 c.mu.objects = make(map[base.DiskFileNum]RemoteObjectMetadata) 94 95 var err error 96 c.mu.marker, c.mu.catalogFilename, err = atomicfs.LocateMarker(fs, dirname, catalogMarkerName) 97 if err != nil { 98 return nil, CatalogContents{}, err 99 } 100 // If the filename is empty, there is no existing catalog. 101 if c.mu.catalogFilename != "" { 102 if err := c.loadFromCatalogFile(c.mu.catalogFilename); err != nil { 103 return nil, CatalogContents{}, err 104 } 105 if err := c.mu.marker.RemoveObsolete(); err != nil { 106 return nil, CatalogContents{}, err 107 } 108 // TODO(radu): remove obsolete catalog files. 109 } 110 res := CatalogContents{ 111 CreatorID: c.mu.creatorID, 112 Objects: make([]RemoteObjectMetadata, 0, len(c.mu.objects)), 113 } 114 for _, meta := range c.mu.objects { 115 res.Objects = append(res.Objects, meta) 116 } 117 // Sort the objects so the function is deterministic. 118 slices.SortFunc(res.Objects, func(a, b RemoteObjectMetadata) int { 119 return cmp.Compare(a.FileNum, b.FileNum) 120 }) 121 return c, res, nil 122 } 123 124 // SetCreatorID sets the creator ID. If it is already set, it must match. 125 func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error { 126 if !id.IsSet() { 127 return errors.AssertionFailedf("attempt to unset CreatorID") 128 } 129 130 c.mu.Lock() 131 defer c.mu.Unlock() 132 133 if c.mu.creatorID.IsSet() { 134 if c.mu.creatorID != id { 135 return errors.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id) 136 } 137 return nil 138 } 139 140 ve := VersionEdit{CreatorID: id} 141 if err := c.writeToCatalogFileLocked(&ve); err != nil { 142 return errors.Wrapf(err, "pebble: could not write to remote object catalog") 143 } 144 c.mu.creatorID = id 145 return nil 146 } 147 148 // Close any open files. 149 func (c *Catalog) Close() error { 150 return c.closeCatalogFile() 151 } 152 153 func (c *Catalog) closeCatalogFile() error { 154 if c.mu.catalogFile == nil { 155 return nil 156 } 157 err1 := c.mu.catalogRecWriter.Close() 158 err2 := c.mu.catalogFile.Close() 159 c.mu.catalogRecWriter = nil 160 c.mu.catalogFile = nil 161 if err1 != nil { 162 return err1 163 } 164 return err2 165 } 166 167 // Batch is used to perform multiple object additions/deletions at once. 168 type Batch struct { 169 ve VersionEdit 170 } 171 172 // AddObject adds a new object to the batch. 173 // 174 // The given FileNum must be new - it must not match that of any object that was 175 // ever in the catalog. 176 func (b *Batch) AddObject(meta RemoteObjectMetadata) { 177 b.ve.NewObjects = append(b.ve.NewObjects, meta) 178 } 179 180 // DeleteObject adds an object removal to the batch. 181 func (b *Batch) DeleteObject(fileNum base.DiskFileNum) { 182 b.ve.DeletedObjects = append(b.ve.DeletedObjects, fileNum) 183 } 184 185 // Reset clears the batch. 186 func (b *Batch) Reset() { 187 b.ve.NewObjects = b.ve.NewObjects[:0] 188 b.ve.DeletedObjects = b.ve.DeletedObjects[:0] 189 } 190 191 // IsEmpty returns true if the batch is empty. 192 func (b *Batch) IsEmpty() bool { 193 return len(b.ve.NewObjects) == 0 && len(b.ve.DeletedObjects) == 0 194 } 195 196 // Copy returns a copy of the Batch. 197 func (b *Batch) Copy() Batch { 198 var res Batch 199 if len(b.ve.NewObjects) > 0 { 200 res.ve.NewObjects = make([]RemoteObjectMetadata, len(b.ve.NewObjects)) 201 copy(res.ve.NewObjects, b.ve.NewObjects) 202 } 203 if len(b.ve.DeletedObjects) > 0 { 204 res.ve.DeletedObjects = make([]base.DiskFileNum, len(b.ve.DeletedObjects)) 205 copy(res.ve.DeletedObjects, b.ve.DeletedObjects) 206 } 207 return res 208 } 209 210 // Append merges two batches. 211 func (b *Batch) Append(other Batch) { 212 b.ve.NewObjects = append(b.ve.NewObjects, other.ve.NewObjects...) 213 b.ve.DeletedObjects = append(b.ve.DeletedObjects, other.ve.DeletedObjects...) 214 } 215 216 // ApplyBatch applies a batch of updates; returns after the change is stably 217 // recorded on storage. 218 func (c *Catalog) ApplyBatch(b Batch) error { 219 c.mu.Lock() 220 defer c.mu.Unlock() 221 222 // Sanity checks. 223 toAdd := make(map[base.DiskFileNum]struct{}, len(b.ve.NewObjects)) 224 exists := func(n base.DiskFileNum) bool { 225 _, ok := c.mu.objects[n] 226 if !ok { 227 _, ok = toAdd[n] 228 } 229 return ok 230 } 231 for _, meta := range b.ve.NewObjects { 232 if exists(meta.FileNum) { 233 return errors.AssertionFailedf("adding existing object %s", meta.FileNum) 234 } 235 toAdd[meta.FileNum] = struct{}{} 236 } 237 for _, n := range b.ve.DeletedObjects { 238 if !exists(n) { 239 return errors.AssertionFailedf("deleting non-existent object %s", n) 240 } 241 } 242 243 if err := c.writeToCatalogFileLocked(&b.ve); err != nil { 244 return errors.Wrapf(err, "pebble: could not write to remote object catalog") 245 } 246 247 // Add new objects before deleting any objects. This allows for cases where 248 // the same batch adds and deletes an object. 249 for _, meta := range b.ve.NewObjects { 250 c.mu.objects[meta.FileNum] = meta 251 } 252 for _, n := range b.ve.DeletedObjects { 253 delete(c.mu.objects, n) 254 } 255 256 return nil 257 } 258 259 func (c *Catalog) loadFromCatalogFile(filename string) error { 260 catalogPath := c.fs.PathJoin(c.dirname, filename) 261 f, err := c.fs.Open(catalogPath) 262 if err != nil { 263 return errors.Wrapf( 264 err, "pebble: could not open remote object catalog file %q for DB %q", 265 errors.Safe(filename), c.dirname, 266 ) 267 } 268 defer f.Close() 269 rr := record.NewReader(f, 0 /* logNum */) 270 for { 271 r, err := rr.Next() 272 if err == io.EOF || record.IsInvalidRecord(err) { 273 break 274 } 275 if err != nil { 276 return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q", 277 errors.Safe(filename)) 278 } 279 var ve VersionEdit 280 if err := ve.Decode(r); err != nil { 281 return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q", 282 errors.Safe(filename)) 283 } 284 // Apply the version edit to the current state. 285 if err := ve.Apply(&c.mu.creatorID, c.mu.objects); err != nil { 286 return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q", 287 errors.Safe(filename)) 288 } 289 } 290 return nil 291 } 292 293 // writeToCatalogFileLocked writes a VersionEdit to the catalog file. 294 // Creates a new file if this is the first write. 295 func (c *Catalog) writeToCatalogFileLocked(ve *VersionEdit) error { 296 c.mu.rotationHelper.AddRecord(int64(len(ve.NewObjects) + len(ve.DeletedObjects))) 297 snapshotSize := int64(len(c.mu.objects)) 298 299 var shouldRotate bool 300 if c.mu.catalogFile == nil { 301 shouldRotate = true 302 } else if c.mu.catalogRecWriter.Size() >= rotateFileSize { 303 shouldRotate = c.mu.rotationHelper.ShouldRotate(snapshotSize) 304 } 305 306 if shouldRotate { 307 if c.mu.catalogFile != nil { 308 if err := c.closeCatalogFile(); err != nil { 309 return err 310 } 311 } 312 if err := c.createNewCatalogFileLocked(); err != nil { 313 return err 314 } 315 c.mu.rotationHelper.Rotate(snapshotSize) 316 } 317 return writeRecord(ve, c.mu.catalogFile, c.mu.catalogRecWriter) 318 } 319 320 func makeCatalogFilename(iter uint64) string { 321 return fmt.Sprintf("%s-%06d", catalogFilenameBase, iter) 322 } 323 324 // createNewCatalogFileLocked creates a new catalog file, populates it with the 325 // current catalog and sets c.mu.catalogFile and c.mu.catalogRecWriter. 326 func (c *Catalog) createNewCatalogFileLocked() (outErr error) { 327 if c.mu.catalogFile != nil { 328 return errors.AssertionFailedf("catalogFile already open") 329 } 330 filename := makeCatalogFilename(c.mu.marker.NextIter()) 331 filepath := c.fs.PathJoin(c.dirname, filename) 332 file, err := c.fs.Create(filepath) 333 if err != nil { 334 return err 335 } 336 recWriter := record.NewWriter(file) 337 err = func() error { 338 // Create a VersionEdit that gets us from an empty catalog to the current state. 339 var ve VersionEdit 340 ve.CreatorID = c.mu.creatorID 341 ve.NewObjects = make([]RemoteObjectMetadata, 0, len(c.mu.objects)) 342 for _, meta := range c.mu.objects { 343 ve.NewObjects = append(ve.NewObjects, meta) 344 } 345 if err := writeRecord(&ve, file, recWriter); err != nil { 346 return err 347 } 348 349 // Move the marker to the new filename. Move handles syncing the data 350 // directory as well. 351 if err := c.mu.marker.Move(filename); err != nil { 352 return errors.Wrap(err, "moving marker") 353 } 354 355 return nil 356 }() 357 358 if err != nil { 359 _ = recWriter.Close() 360 _ = file.Close() 361 _ = c.fs.Remove(filepath) 362 return err 363 } 364 365 // Remove any previous file (ignoring any error). 366 if c.mu.catalogFilename != "" { 367 _ = c.fs.Remove(c.fs.PathJoin(c.dirname, c.mu.catalogFilename)) 368 } 369 370 c.mu.catalogFile = file 371 c.mu.catalogRecWriter = recWriter 372 c.mu.catalogFilename = filename 373 return nil 374 } 375 376 func writeRecord(ve *VersionEdit, file vfs.File, recWriter *record.Writer) error { 377 w, err := recWriter.Next() 378 if err != nil { 379 return err 380 } 381 if err := ve.Encode(w); err != nil { 382 return err 383 } 384 if err := recWriter.Flush(); err != nil { 385 return err 386 } 387 return file.Sync() 388 }