github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/remoteobjcat/catalog.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package remoteobjcat

import (
	"fmt"
	"io"
	"sort"
	"sync"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

// Catalog is used to manage the on-disk remote object catalog.
//
// The catalog file is a log of records, where each record is an encoded
// VersionEdit.
type Catalog struct {
	fs      vfs.FS
	dirname string
	mu      struct {
		sync.Mutex

		creatorID objstorage.CreatorID
		objects   map[base.DiskFileNum]RemoteObjectMetadata

		marker *atomicfs.Marker

		catalogFile      vfs.File
		catalogRecWriter *record.Writer

		rotationHelper record.RotationHelper

		// catalogFilename is the filename of catalogFile when catalogFile != nil, otherwise
		// it is the filename of the last catalog file.
		catalogFilename string
	}
}

// RemoteObjectMetadata encapsulates the data stored in the catalog file for each object.
type RemoteObjectMetadata struct {
	// FileNum is the identifier for the object within the context of a single DB
	// instance.
	FileNum base.DiskFileNum
	// FileType is the type of the object. Only certain FileTypes are possible.
	FileType base.FileType
	// CreatorID identifies the DB instance that originally created the object.
	CreatorID objstorage.CreatorID
	// CreatorFileNum is the identifier for the object within the context of the
	// DB instance that originally created the object.
	CreatorFileNum base.DiskFileNum
	// CleanupMethod indicates the method for cleaning up unused shared objects.
	CleanupMethod objstorage.SharedCleanupMethod
	// Locator identifies a remote.Storage implementation.
	Locator remote.Locator
	// CustomObjectName (if it is set) overrides the object name that is normally
	// derived from the CreatorID and CreatorFileNum.
	CustomObjectName string
}

const (
	catalogFilenameBase = "REMOTE-OBJ-CATALOG"
	catalogMarkerName   = "remote-obj-catalog"

	// We create a new file when the size exceeds 1MB (and some other conditions
	// hold; see record.RotationHelper).
	rotateFileSize = 1024 * 1024 // 1MB
)

// CatalogContents contains the remote objects in the catalog.
type CatalogContents struct {
	// CreatorID, if it is set.
	CreatorID objstorage.CreatorID
	Objects   []RemoteObjectMetadata
}

// Open creates a Catalog and loads any existing catalog file, returning the
// creator ID (if it is set) and the contents.
func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) {
	c := &Catalog{
		fs:      fs,
		dirname: dirname,
	}
	c.mu.objects = make(map[base.DiskFileNum]RemoteObjectMetadata)

	var err error
	c.mu.marker, c.mu.catalogFilename, err = atomicfs.LocateMarker(fs, dirname, catalogMarkerName)
	if err != nil {
		return nil, CatalogContents{}, err
	}
	// If the filename is empty, there is no existing catalog.
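	// Otherwise, the marker (located above) points at the current catalog
	// file; the marker is only moved to a newly created catalog file after
	// that file has been populated and synced (see createNewCatalogFileLocked),
	// so the named file can be loaded directly.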
	if c.mu.catalogFilename != "" {
		if err := c.loadFromCatalogFile(c.mu.catalogFilename); err != nil {
			return nil, CatalogContents{}, err
		}
		if err := c.mu.marker.RemoveObsolete(); err != nil {
			return nil, CatalogContents{}, err
		}
		// TODO(radu): remove obsolete catalog files.
	}
	res := CatalogContents{
		CreatorID: c.mu.creatorID,
		Objects:   make([]RemoteObjectMetadata, 0, len(c.mu.objects)),
	}
	for _, meta := range c.mu.objects {
		res.Objects = append(res.Objects, meta)
	}
	// Sort the objects so the function is deterministic.
	sort.Slice(res.Objects, func(i, j int) bool {
		return res.Objects[i].FileNum.FileNum() < res.Objects[j].FileNum.FileNum()
	})
	return c, res, nil
}

// SetCreatorID sets the creator ID. If it is already set, it must match.
func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error {
	if !id.IsSet() {
		return errors.AssertionFailedf("attempt to unset CreatorID")
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	if c.mu.creatorID.IsSet() {
		if c.mu.creatorID != id {
			return errors.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id)
		}
		return nil
	}

	ve := VersionEdit{CreatorID: id}
	if err := c.writeToCatalogFileLocked(&ve); err != nil {
		return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err)
	}
	c.mu.creatorID = id
	return nil
}

// Close any open files.
func (c *Catalog) Close() error {
	return c.closeCatalogFile()
}

func (c *Catalog) closeCatalogFile() error {
	if c.mu.catalogFile == nil {
		return nil
	}
	err1 := c.mu.catalogRecWriter.Close()
	err2 := c.mu.catalogFile.Close()
	c.mu.catalogRecWriter = nil
	c.mu.catalogFile = nil
	if err1 != nil {
		return err1
	}
	return err2
}

// Batch is used to perform multiple object additions/deletions at once.
type Batch struct {
	ve VersionEdit
}

// AddObject adds a new object to the batch.
//
// The given FileNum must be new - it must not match that of any object that was
// ever in the catalog.
func (b *Batch) AddObject(meta RemoteObjectMetadata) {
	b.ve.NewObjects = append(b.ve.NewObjects, meta)
}

// DeleteObject adds an object removal to the batch.
func (b *Batch) DeleteObject(fileNum base.DiskFileNum) {
	b.ve.DeletedObjects = append(b.ve.DeletedObjects, fileNum)
}

// Reset clears the batch.
func (b *Batch) Reset() {
	b.ve.NewObjects = b.ve.NewObjects[:0]
	b.ve.DeletedObjects = b.ve.DeletedObjects[:0]
}

// IsEmpty returns true if the batch is empty.
func (b *Batch) IsEmpty() bool {
	return len(b.ve.NewObjects) == 0 && len(b.ve.DeletedObjects) == 0
}

// Copy returns a copy of the Batch.
func (b *Batch) Copy() Batch {
	var res Batch
	if len(b.ve.NewObjects) > 0 {
		res.ve.NewObjects = make([]RemoteObjectMetadata, len(b.ve.NewObjects))
		copy(res.ve.NewObjects, b.ve.NewObjects)
	}
	if len(b.ve.DeletedObjects) > 0 {
		res.ve.DeletedObjects = make([]base.DiskFileNum, len(b.ve.DeletedObjects))
		copy(res.ve.DeletedObjects, b.ve.DeletedObjects)
	}
	return res
}

// Append merges two batches.
func (b *Batch) Append(other Batch) {
	b.ve.NewObjects = append(b.ve.NewObjects, other.ve.NewObjects...)
	b.ve.DeletedObjects = append(b.ve.DeletedObjects, other.ve.DeletedObjects...)
}
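
// exampleReplaceObject is an illustrative sketch, not part of the upstream
// file: it shows how the Batch API above stages multiple changes that are
// then recorded together by ApplyBatch (defined below). ApplyBatch applies
// additions before deletions, so a single batch may even add and delete the
// same object. The function name and parameters are hypothetical.
func exampleReplaceObject(cat *Catalog, newMeta RemoteObjectMetadata, oldFileNum base.DiskFileNum) error {
	var b Batch
	b.AddObject(newMeta)       // stage the new object
	b.DeleteObject(oldFileNum) // stage removal of the object it replaces
	return cat.ApplyBatch(b)
}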

// ApplyBatch applies a batch of updates; returns after the change is stably
// recorded on storage.
func (c *Catalog) ApplyBatch(b Batch) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Sanity checks.
	toAdd := make(map[base.DiskFileNum]struct{}, len(b.ve.NewObjects))
	exists := func(n base.DiskFileNum) bool {
		_, ok := c.mu.objects[n]
		if !ok {
			_, ok = toAdd[n]
		}
		return ok
	}
	for _, meta := range b.ve.NewObjects {
		if exists(meta.FileNum) {
			return errors.AssertionFailedf("adding existing object %s", meta.FileNum)
		}
		toAdd[meta.FileNum] = struct{}{}
	}
	for _, n := range b.ve.DeletedObjects {
		if !exists(n) {
			return errors.AssertionFailedf("deleting non-existent object %s", n)
		}
	}

	if err := c.writeToCatalogFileLocked(&b.ve); err != nil {
		return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err)
	}

	// Add new objects before deleting any objects. This allows for cases where
	// the same batch adds and deletes an object.
	for _, meta := range b.ve.NewObjects {
		c.mu.objects[meta.FileNum] = meta
	}
	for _, n := range b.ve.DeletedObjects {
		delete(c.mu.objects, n)
	}

	return nil
}

func (c *Catalog) loadFromCatalogFile(filename string) error {
	catalogPath := c.fs.PathJoin(c.dirname, filename)
	f, err := c.fs.Open(catalogPath)
	if err != nil {
		return errors.Wrapf(
			err, "pebble: could not open remote object catalog file %q for DB %q",
			errors.Safe(filename), c.dirname,
		)
	}
	defer f.Close()
	rr := record.NewReader(f, 0 /* logNum */)
	for {
		r, err := rr.Next()
		if err == io.EOF || record.IsInvalidRecord(err) {
			break
		}
		if err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
		var ve VersionEdit
		if err := ve.Decode(r); err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
		// Apply the version edit to the current state.
		if err := ve.Apply(&c.mu.creatorID, c.mu.objects); err != nil {
			return errors.Wrapf(err, "pebble: error when loading remote object catalog file %q",
				errors.Safe(filename))
		}
	}
	return nil
}

// writeToCatalogFileLocked writes a VersionEdit to the catalog file.
// Creates a new file if this is the first write.
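// The file is rotated once its size exceeds rotateFileSize and the
// record.RotationHelper agrees that rewriting is worthwhile; rotation creates
// a fresh file seeded with a VersionEdit capturing the full current state
// (see createNewCatalogFileLocked), so catalog files do not grow without
// bound.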
func (c *Catalog) writeToCatalogFileLocked(ve *VersionEdit) error {
	c.mu.rotationHelper.AddRecord(int64(len(ve.NewObjects) + len(ve.DeletedObjects)))
	snapshotSize := int64(len(c.mu.objects))

	var shouldRotate bool
	if c.mu.catalogFile == nil {
		shouldRotate = true
	} else if c.mu.catalogRecWriter.Size() >= rotateFileSize {
		shouldRotate = c.mu.rotationHelper.ShouldRotate(snapshotSize)
	}

	if shouldRotate {
		if c.mu.catalogFile != nil {
			if err := c.closeCatalogFile(); err != nil {
				return err
			}
		}
		if err := c.createNewCatalogFileLocked(); err != nil {
			return err
		}
		c.mu.rotationHelper.Rotate(snapshotSize)
	}
	return writeRecord(ve, c.mu.catalogFile, c.mu.catalogRecWriter)
}

func makeCatalogFilename(iter uint64) string {
	return fmt.Sprintf("%s-%06d", catalogFilenameBase, iter)
}

// createNewCatalogFileLocked creates a new catalog file, populates it with the
// current catalog and sets c.mu.catalogFile and c.mu.catalogRecWriter.
func (c *Catalog) createNewCatalogFileLocked() (outErr error) {
	if c.mu.catalogFile != nil {
		return errors.AssertionFailedf("catalogFile already open")
	}
	filename := makeCatalogFilename(c.mu.marker.NextIter())
	filepath := c.fs.PathJoin(c.dirname, filename)
	file, err := c.fs.Create(filepath)
	if err != nil {
		return err
	}
	recWriter := record.NewWriter(file)
	err = func() error {
		// Create a VersionEdit that gets us from an empty catalog to the current state.
		var ve VersionEdit
		ve.CreatorID = c.mu.creatorID
		ve.NewObjects = make([]RemoteObjectMetadata, 0, len(c.mu.objects))
		for _, meta := range c.mu.objects {
			ve.NewObjects = append(ve.NewObjects, meta)
		}
		if err := writeRecord(&ve, file, recWriter); err != nil {
			return err
		}

		// Move the marker to the new filename. Move handles syncing the data
		// directory as well.
		if err := c.mu.marker.Move(filename); err != nil {
			return errors.Wrap(err, "moving marker")
		}

		return nil
	}()

	if err != nil {
		_ = recWriter.Close()
		_ = file.Close()
		_ = c.fs.Remove(filepath)
		return err
	}

	// Remove any previous file (ignoring any error).
	if c.mu.catalogFilename != "" {
		_ = c.fs.Remove(c.fs.PathJoin(c.dirname, c.mu.catalogFilename))
	}

	c.mu.catalogFile = file
	c.mu.catalogRecWriter = recWriter
	c.mu.catalogFilename = filename
	return nil
}

func writeRecord(ve *VersionEdit, file vfs.File, recWriter *record.Writer) error {
	w, err := recWriter.Next()
	if err != nil {
		return err
	}
	if err := ve.Encode(w); err != nil {
		return err
	}
	if err := recWriter.Flush(); err != nil {
		return err
	}
	return file.Sync()
}
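
// exampleCatalogUsage is an illustrative sketch, not part of the upstream
// file: it shows one way the API above fits together (Open, SetCreatorID,
// staging changes in a Batch, ApplyBatch, Close). It assumes an in-memory
// filesystem via vfs.NewMem, and that base.FileNum(...).DiskFileNum(),
// base.FileTypeTable, and objstorage.SharedRefTracking are the appropriate
// constructors/constants for this Pebble version; the directory, file
// numbers, and locator below are hypothetical.
func exampleCatalogUsage() error {
	fs := vfs.NewMem()
	const dir = "/remote-obj-catalog-example"
	if err := fs.MkdirAll(dir, 0755); err != nil {
		return err
	}

	// Open loads any existing catalog (none here) and returns its contents.
	cat, contents, err := Open(fs, dir)
	if err != nil {
		return err
	}
	defer cat.Close()
	_ = contents.Objects // existing objects, sorted by FileNum

	// The creator ID can be set only once; later calls must pass the same ID.
	if err := cat.SetCreatorID(objstorage.CreatorID(1)); err != nil {
		return err
	}

	// Stage an addition; ApplyBatch returns only after the encoded VersionEdit
	// has been synced to the catalog file.
	var b Batch
	b.AddObject(RemoteObjectMetadata{
		FileNum:        base.FileNum(1).DiskFileNum(),
		FileType:       base.FileTypeTable,
		CreatorID:      objstorage.CreatorID(1),
		CreatorFileNum: base.FileNum(1).DiskFileNum(),
		CleanupMethod:  objstorage.SharedRefTracking,
		Locator:        remote.Locator("example-bucket"),
	})
	return cat.ApplyBatch(b)
}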