github.com/lalkh/containerd@v1.4.3/metadata/db.go

/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package metadata

import (
	"context"
	"encoding/binary"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/containerd/containerd/content"
	"github.com/containerd/containerd/gc"
	"github.com/containerd/containerd/log"
	"github.com/containerd/containerd/snapshots"
	"github.com/pkg/errors"
	bolt "go.etcd.io/bbolt"
)

const (
	// schemaVersion represents the schema version of
	// the database. This schema version represents the
	// structure of the data in the database. The schema
	// can evolve at any time but any backwards
	// incompatible changes or structural changes require
	// bumping the schema version.
	schemaVersion = "v1"

	// dbVersion represents updates to the schema
	// version which are additions and compatible with
	// prior versions of the same schema.
	dbVersion = 3
)

// DBOpt configures how we set up the DB
type DBOpt func(*dbOptions)

// WithPolicyIsolated isolates contents between namespaces
func WithPolicyIsolated(o *dbOptions) {
	o.shared = false
}

// dbOptions configure db options.
type dbOptions struct {
	shared bool
}

// DB represents a metadata database backed by a bolt
// database. The database is fully namespaced and stores
// image, container, namespace, snapshot, and content data
// while proxying data shared across namespaces to backend
// datastores for content and snapshots.
type DB struct {
	db *bolt.DB
	ss map[string]*snapshotter
	cs *contentStore

	// wlock is used to protect access to the data structures during garbage
	// collection. While the wlock is held no writable transactions can be
	// opened, preventing changes from occurring between the mark and
	// sweep phases without preventing read transactions.
	wlock sync.RWMutex

	// dirty flag indicates that references have been removed which require
	// a garbage collection to ensure the database is clean. This tracks
	// the number of dirty operations. This should be updated and read
	// atomically if outside of wlock.Lock.
	dirty uint32

	// dirtySS and dirtyCS flags keep track of datastores which have had
	// deletions since the last garbage collection. These datastores will
	// be garbage collected during the next garbage collection. These
	// should only be updated inside of a write transaction or wlock.Lock.
	dirtySS map[string]struct{}
	dirtyCS bool

	// mutationCallbacks are called after each mutation with the flag
	// set indicating whether any dirty flags are set
	mutationCallbacks []func(bool)

	dbopts dbOptions
}

// NewDB creates a new metadata database using the provided
// bolt database, content store, and snapshotters.
func NewDB(db *bolt.DB, cs content.Store, ss map[string]snapshots.Snapshotter, opts ...DBOpt) *DB {
	m := &DB{
		db:      db,
		ss:      make(map[string]*snapshotter, len(ss)),
		dirtySS: map[string]struct{}{},
		dbopts: dbOptions{
			shared: true,
		},
	}

	for _, opt := range opts {
		opt(&m.dbopts)
	}

	// Initialize data stores
	m.cs = newContentStore(m, m.dbopts.shared, cs)
	for name, sn := range ss {
		m.ss[name] = newSnapshotter(m, name, sn)
	}

	return m
}
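// exampleNewDB is an illustrative sketch, not part of the original file,
// showing how a caller might wire up the metadata DB. The database path is
// a hypothetical placeholder; real callers pass the daemon's content store
// and configured snapshotters, and may omit WithPolicyIsolated to keep the
// default shared policy.
func exampleNewDB(cs content.Store, sns map[string]snapshots.Snapshotter) (*DB, error) {
	bdb, err := bolt.Open("/var/lib/containerd/meta.db", 0644, nil) // hypothetical path
	if err != nil {
		return nil, err
	}
	// WithPolicyIsolated opts out of sharing content and snapshots
	// across namespaces.
	mdb := NewDB(bdb, cs, sns, WithPolicyIsolated)
	// Init (defined below) runs any pending migrations and records the
	// current schema version.
	if err := mdb.Init(context.Background()); err != nil {
		return nil, err
	}
	return mdb, nil
}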
// Init ensures the database is at the correct version
// and performs any needed migrations.
func (m *DB) Init(ctx context.Context) error {
	// errSkip is used when no migration or version needs to be written
	// to the database and the transaction can be immediately rolled
	// back rather than performing a much slower and unnecessary commit.
	var errSkip = errors.New("skip update")

	err := m.db.Update(func(tx *bolt.Tx) error {
		var (
			// current schema and version
			schema  = "v0"
			version = 0
		)

		// i represents the index of the first migration
		// which must be run to get the database up to date.
		// The migrations' versions are checked in reverse
		// order, decrementing i for each migration which
		// represents a version newer than the current
		// database version.
		i := len(migrations)

		for ; i > 0; i-- {
			migration := migrations[i-1]

			bkt := tx.Bucket([]byte(migration.schema))
			if bkt == nil {
				// Hasn't encountered another schema, go to next migration
				if schema == "v0" {
					continue
				}
				break
			}
			if schema == "v0" {
				schema = migration.schema
				vb := bkt.Get(bucketKeyDBVersion)
				if vb != nil {
					v, _ := binary.Varint(vb)
					version = int(v)
				}
			}

			if version >= migration.version {
				break
			}
		}

		// Previous version of database found
		if schema != "v0" {
			updates := migrations[i:]

			// No migration updates, return immediately
			if len(updates) == 0 {
				return errSkip
			}

			for _, m := range updates {
				t0 := time.Now()
				if err := m.migrate(tx); err != nil {
					return errors.Wrapf(err, "failed to migrate to %s.%d", m.schema, m.version)
				}
				log.G(ctx).WithField("d", time.Since(t0)).Debugf("finished database migration to %s.%d", m.schema, m.version)
			}
		}

		bkt, err := tx.CreateBucketIfNotExists(bucketKeyVersion)
		if err != nil {
			return err
		}

		versionEncoded, err := encodeInt(dbVersion)
		if err != nil {
			return err
		}

		return bkt.Put(bucketKeyDBVersion, versionEncoded)
	})
	if err == errSkip {
		err = nil
	}
	return err
}
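// exampleReadVersion is an illustrative sketch, not part of the original
// file, of how Init decodes the stored version: a varint stored under
// bucketKeyDBVersion in the schema bucket ("v1"). It reads but never
// writes, so it uses a View transaction.
func exampleReadVersion(db *bolt.DB) (int, error) {
	var version int
	err := db.View(func(tx *bolt.Tx) error {
		bkt := tx.Bucket([]byte(schemaVersion))
		if bkt == nil {
			// Fresh database; no version has been written yet.
			return nil
		}
		if vb := bkt.Get(bucketKeyDBVersion); vb != nil {
			v, _ := binary.Varint(vb)
			version = int(v)
		}
		return nil
	})
	return version, err
}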
// ContentStore returns a namespaced content store
// proxied to a content store.
func (m *DB) ContentStore() content.Store {
	if m.cs == nil {
		return nil
	}
	return m.cs
}

// Snapshotter returns a namespaced snapshotter for
// the requested snapshotter name, proxied to a snapshotter.
func (m *DB) Snapshotter(name string) snapshots.Snapshotter {
	sn, ok := m.ss[name]
	if !ok {
		return nil
	}
	return sn
}

// Snapshotters returns all available snapshotters.
func (m *DB) Snapshotters() map[string]snapshots.Snapshotter {
	ss := make(map[string]snapshots.Snapshotter, len(m.ss))
	for n, sn := range m.ss {
		ss[n] = sn
	}
	return ss
}

// View runs a readonly transaction on the metadata store.
func (m *DB) View(fn func(*bolt.Tx) error) error {
	return m.db.View(fn)
}

// Update runs a writable transaction on the metadata store.
func (m *DB) Update(fn func(*bolt.Tx) error) error {
	m.wlock.RLock()
	defer m.wlock.RUnlock()
	err := m.db.Update(fn)
	if err == nil {
		dirty := atomic.LoadUint32(&m.dirty) > 0
		for _, fn := range m.mutationCallbacks {
			fn(dirty)
		}
	}

	return err
}

// RegisterMutationCallback registers a function to be called after a metadata
// mutation has been performed.
//
// The callback function's argument indicates whether a deletion has occurred
// since the last garbage collection.
func (m *DB) RegisterMutationCallback(fn func(bool)) {
	m.wlock.Lock()
	m.mutationCallbacks = append(m.mutationCallbacks, fn)
	m.wlock.Unlock()
}

// GCStats holds the duration for the different phases of the garbage collector
type GCStats struct {
	MetaD     time.Duration
	ContentD  time.Duration
	SnapshotD map[string]time.Duration
}

// Elapsed returns the duration which elapsed during a collection
func (s GCStats) Elapsed() time.Duration {
	return s.MetaD
}

// GarbageCollect starts garbage collection
func (m *DB) GarbageCollect(ctx context.Context) (gc.Stats, error) {
	m.wlock.Lock()
	t1 := time.Now()

	marked, err := m.getMarked(ctx)
	if err != nil {
		m.wlock.Unlock()
		return nil, err
	}

	if err := m.db.Update(func(tx *bolt.Tx) error {
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()

		rm := func(ctx context.Context, n gc.Node) error {
			if _, ok := marked[n]; ok {
				return nil
			}

			if n.Type == ResourceSnapshot {
				if idx := strings.IndexRune(n.Key, '/'); idx > 0 {
					m.dirtySS[n.Key[:idx]] = struct{}{}
				}
			} else if n.Type == ResourceContent || n.Type == ResourceIngest {
				m.dirtyCS = true
			}
			return remove(ctx, tx, n)
		}

		if err := scanAll(ctx, tx, rm); err != nil {
			return errors.Wrap(err, "failed to scan and remove")
		}

		return nil
	}); err != nil {
		m.wlock.Unlock()
		return nil, err
	}

	var stats GCStats
	var wg sync.WaitGroup

	// reset dirty, no need for atomic inside of wlock.Lock
	m.dirty = 0

	if len(m.dirtySS) > 0 {
		var sl sync.Mutex
		stats.SnapshotD = map[string]time.Duration{}
		wg.Add(len(m.dirtySS))
		for snapshotterName := range m.dirtySS {
			log.G(ctx).WithField("snapshotter", snapshotterName).Debug("schedule snapshotter cleanup")
			go func(snapshotterName string) {
				st1 := time.Now()
				m.cleanupSnapshotter(snapshotterName)

				sl.Lock()
				stats.SnapshotD[snapshotterName] = time.Since(st1)
				sl.Unlock()

				wg.Done()
			}(snapshotterName)
		}
		m.dirtySS = map[string]struct{}{}
	}

	if m.dirtyCS {
		wg.Add(1)
		log.G(ctx).Debug("schedule content cleanup")
		go func() {
			ct1 := time.Now()
			m.cleanupContent()
			stats.ContentD = time.Since(ct1)
			wg.Done()
		}()
		m.dirtyCS = false
	}

	stats.MetaD = time.Since(t1)
	m.wlock.Unlock()

	wg.Wait()

	return stats, err
}
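// exampleScheduleGC is an illustrative sketch, not part of the original
// file, of pairing RegisterMutationCallback with GarbageCollect: whenever
// a mutation reports outstanding deletions, a collection is triggered on a
// separate goroutine. A real scheduler (containerd ships one in its
// gc/scheduler package) additionally applies pacing and debouncing.
func exampleScheduleGC(ctx context.Context, mdb *DB) {
	trigger := make(chan struct{}, 1)
	mdb.RegisterMutationCallback(func(dirty bool) {
		if dirty {
			select {
			case trigger <- struct{}{}:
			default: // a collection is already pending
			}
		}
	})
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case <-trigger:
				if _, err := mdb.GarbageCollect(ctx); err != nil {
					log.G(ctx).WithError(err).Warn("garbage collection failed")
				}
			}
		}
	}()
}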
func (m *DB) getMarked(ctx context.Context) (map[gc.Node]struct{}, error) {
	var marked map[gc.Node]struct{}
	if err := m.db.View(func(tx *bolt.Tx) error {
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()

		var (
			nodes []gc.Node
			wg    sync.WaitGroup
			roots = make(chan gc.Node)
		)
		wg.Add(1)
		go func() {
			defer wg.Done()
			for n := range roots {
				nodes = append(nodes, n)
			}
		}()
		// Scan the database roots, sending each node to the collector
		// goroutine above.
		if err := scanRoots(ctx, tx, roots); err != nil {
			cancel()
			return err
		}
		close(roots)
		wg.Wait()

		refs := func(n gc.Node) ([]gc.Node, error) {
			var sn []gc.Node
			if err := references(ctx, tx, n, func(nn gc.Node) {
				sn = append(sn, nn)
			}); err != nil {
				return nil, err
			}
			return sn, nil
		}

		reachable, err := gc.Tricolor(nodes, refs)
		if err != nil {
			return err
		}
		marked = reachable
		return nil
	}); err != nil {
		return nil, err
	}
	return marked, nil
}

func (m *DB) cleanupSnapshotter(name string) (time.Duration, error) {
	ctx := context.Background()
	sn, ok := m.ss[name]
	if !ok {
		return 0, nil
	}

	d, err := sn.garbageCollect(ctx)
	logger := log.G(ctx).WithField("snapshotter", name)
	if err != nil {
		logger.WithError(err).Warn("snapshot garbage collection failed")
	} else {
		logger.WithField("d", d).Debugf("snapshot garbage collected")
	}
	return d, err
}

func (m *DB) cleanupContent() (time.Duration, error) {
	ctx := context.Background()
	if m.cs == nil {
		return 0, nil
	}

	d, err := m.cs.garbageCollect(ctx)
	if err != nil {
		log.G(ctx).WithError(err).Warn("content garbage collection failed")
	} else {
		log.G(ctx).WithField("d", d).Debugf("content garbage collected")
	}

	return d, err
}
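// exampleTricolor is an illustrative sketch, not part of the original
// file, of the mark phase getMarked delegates to gc.Tricolor: the roots
// and an edge function go in, and every node returned is considered
// reachable; everything else is swept. The two nodes here are synthetic,
// standing in for a manifest blob that references a layer blob.
func exampleTricolor() (map[gc.Node]struct{}, error) {
	manifest := gc.Node{Type: ResourceContent, Namespace: "default", Key: "sha256:manifest"}
	layer := gc.Node{Type: ResourceContent, Namespace: "default", Key: "sha256:layer"}

	// Only the manifest is a root; the layer is reachable through it, so
	// both end up in the returned set.
	refs := func(n gc.Node) ([]gc.Node, error) {
		if n == manifest {
			return []gc.Node{layer}, nil
		}
		return nil, nil
	}
	return gc.Tricolor([]gc.Node{manifest}, refs)
}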