github.com/lalkh/containerd@v1.4.3/metadata/db.go (about)

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package metadata
    18  
    19  import (
    20  	"context"
    21  	"encoding/binary"
    22  	"strings"
    23  	"sync"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"github.com/containerd/containerd/content"
    28  	"github.com/containerd/containerd/gc"
    29  	"github.com/containerd/containerd/log"
    30  	"github.com/containerd/containerd/snapshots"
    31  	"github.com/pkg/errors"
    32  	bolt "go.etcd.io/bbolt"
    33  )
    34  
const (
	// schemaVersion represents the schema version of
	// the database. This schema version represents the
	// structure of the data in the database. The schema
	// can evolve at any time but any backwards
	// incompatible changes or structural changes require
	// bumping the schema version.
	schemaVersion = "v1"

	// dbVersion represents updates to the schema
	// version which are additions and compatible with
	// prior version of the same schema. Init writes this
	// value to the database once it is up to date.
	dbVersion = 3
)
    49  
// DBOpt configures how we set up the DB. Each option receives the
// dbOptions of the DB being built by NewDB and may modify them in place.
type DBOpt func(*dbOptions)
    52  
// WithPolicyIsolated isolates contents between namespaces. It does so by
// clearing the shared option, which NewDB otherwise defaults to true.
// The function itself has the DBOpt signature, so it is passed to NewDB
// as a value rather than being called.
func WithPolicyIsolated(o *dbOptions) {
	o.shared = false
}
    57  
// dbOptions configure db options.
type dbOptions struct {
	// shared is handed to newContentStore by NewDB. NewDB defaults it to
	// true; WithPolicyIsolated sets it to false.
	shared bool
}
    62  
// DB represents a metadata database backed by a bolt
// database. The database is fully namespaced and stores
// image, container, namespace, snapshot, and content data
// while proxying data shared across namespaces to backend
// datastores for content and snapshots.
type DB struct {
	// db is the underlying bolt database all transactions run against.
	db *bolt.DB
	// ss holds the snapshotter wrappers created by NewDB, keyed by
	// snapshotter name.
	ss map[string]*snapshotter
	// cs is the content store wrapper created by NewDB.
	cs *contentStore

	// wlock is used to protect access to the data structures during garbage
	// collection. While the wlock is held no writable transactions can be
	// opened, preventing changes from occurring between the mark and
	// sweep phases without preventing read transactions.
	wlock sync.RWMutex

	// dirty flag indicates that references have been removed which require
	// a garbage collection to ensure the database is clean. This tracks
	// the number of dirty operations. This should be updated and read
	// atomically if outside of wlock.Lock.
	dirty uint32

	// dirtySS and dirtyCS flags keep track of datastores which have had
	// deletions since the last garbage collection. These datastores will
	// be garbage collected during the next garbage collection. These
	// should only be updated inside of a write transaction or wlock.Lock.
	dirtySS map[string]struct{}
	dirtyCS bool

	// mutationCallbacks are called after each mutation with the flag
	// set indicating whether any dirty flags are set
	mutationCallbacks []func(bool)

	// dbopts holds the options the DB was constructed with.
	dbopts dbOptions
}
    98  
    99  // NewDB creates a new metadata database using the provided
   100  // bolt database, content store, and snapshotters.
   101  func NewDB(db *bolt.DB, cs content.Store, ss map[string]snapshots.Snapshotter, opts ...DBOpt) *DB {
   102  	m := &DB{
   103  		db:      db,
   104  		ss:      make(map[string]*snapshotter, len(ss)),
   105  		dirtySS: map[string]struct{}{},
   106  		dbopts: dbOptions{
   107  			shared: true,
   108  		},
   109  	}
   110  
   111  	for _, opt := range opts {
   112  		opt(&m.dbopts)
   113  	}
   114  
   115  	// Initialize data stores
   116  	m.cs = newContentStore(m, m.dbopts.shared, cs)
   117  	for name, sn := range ss {
   118  		m.ss[name] = newSnapshotter(m, name, sn)
   119  	}
   120  
   121  	return m
   122  }
   123  
// Init ensures the database is at the correct version
// and performs any needed migrations.
//
// Everything runs inside a single bolt write transaction opened directly
// on the underlying database (not through DB.Update, so wlock is not taken
// and mutation callbacks are not invoked). When the database is found to
// be up to date already, the transaction is rolled back via errSkip and
// nil is returned. A migration failure is returned wrapped with the
// schema and version it was migrating to.
func (m *DB) Init(ctx context.Context) error {
	// errSkip is used when no migration or version needs to be written
	// to the database and the transaction can be immediately rolled
	// back rather than performing a much slower and unnecessary commit.
	var errSkip = errors.New("skip update")

	err := m.db.Update(func(tx *bolt.Tx) error {
		var (
			// current schema and version; "v0" and 0 act as the
			// "nothing found yet" values until a schema bucket is seen
			schema  = "v0"
			version = 0
		)

		// i represents the index of the first migration
		// which must be run to get the database up to date.
		// The migration's version will be checked in reverse
		// order, decrementing i for each migration which
		// represents a version newer than the current
		// database version
		i := len(migrations)

		for ; i > 0; i-- {
			migration := migrations[i-1]

			bkt := tx.Bucket([]byte(migration.schema))
			if bkt == nil {
				// Hasn't encountered another schema, go to next migration
				if schema == "v0" {
					continue
				}
				// A schema was already found and this migration belongs
				// to one that is absent from the database: stop scanning.
				break
			}
			if schema == "v0" {
				// First existing schema bucket: record it and read the
				// stored db version, which stays 0 when the key is missing.
				schema = migration.schema
				vb := bkt.Get(bucketKeyDBVersion)
				if vb != nil {
					// The byte count returned by Varint is discarded, so a
					// malformed value is not detected here.
					v, _ := binary.Varint(vb)
					version = int(v)
				}
			}

			// The database already includes this migration; everything
			// from index i onwards still has to be applied.
			if version >= migration.version {
				break
			}
		}

		// Previous version of database found
		if schema != "v0" {
			updates := migrations[i:]

			// No migration updates, return immediately
			if len(updates) == 0 {
				return errSkip
			}

			// Note: m shadows the receiver inside this loop and refers to
			// the migration being applied.
			for _, m := range updates {
				t0 := time.Now()
				if err := m.migrate(tx); err != nil {
					return errors.Wrapf(err, "failed to migrate to %s.%d", m.schema, m.version)
				}
				log.G(ctx).WithField("d", time.Since(t0)).Debugf("finished database migration to %s.%d", m.schema, m.version)
			}
		}

		// Reached for a database with no known schema bucket or after
		// migrations were applied: record the current db version.
		bkt, err := tx.CreateBucketIfNotExists(bucketKeyVersion)
		if err != nil {
			return err
		}

		versionEncoded, err := encodeInt(dbVersion)
		if err != nil {
			return err
		}

		return bkt.Put(bucketKeyDBVersion, versionEncoded)
	})
	// errSkip only signals the intentional rollback and is not a failure.
	if err == errSkip {
		err = nil
	}
	return err
}
   207  
   208  // ContentStore returns a namespaced content store
   209  // proxied to a content store.
   210  func (m *DB) ContentStore() content.Store {
   211  	if m.cs == nil {
   212  		return nil
   213  	}
   214  	return m.cs
   215  }
   216  
   217  // Snapshotter returns a namespaced content store for
   218  // the requested snapshotter name proxied to a snapshotter.
   219  func (m *DB) Snapshotter(name string) snapshots.Snapshotter {
   220  	sn, ok := m.ss[name]
   221  	if !ok {
   222  		return nil
   223  	}
   224  	return sn
   225  }
   226  
   227  // Snapshotters returns all available snapshotters.
   228  func (m *DB) Snapshotters() map[string]snapshots.Snapshotter {
   229  	ss := make(map[string]snapshots.Snapshotter, len(m.ss))
   230  	for n, sn := range m.ss {
   231  		ss[n] = sn
   232  	}
   233  	return ss
   234  }
   235  
// View runs a readonly transaction on the metadata store. It delegates
// directly to the bolt database and does not acquire wlock.
func (m *DB) View(fn func(*bolt.Tx) error) error {
	return m.db.View(fn)
}
   240  
   241  // Update runs a writable transaction on the metadata store.
   242  func (m *DB) Update(fn func(*bolt.Tx) error) error {
   243  	m.wlock.RLock()
   244  	defer m.wlock.RUnlock()
   245  	err := m.db.Update(fn)
   246  	if err == nil {
   247  		dirty := atomic.LoadUint32(&m.dirty) > 0
   248  		for _, fn := range m.mutationCallbacks {
   249  			fn(dirty)
   250  		}
   251  	}
   252  
   253  	return err
   254  }
   255  
   256  // RegisterMutationCallback registers a function to be called after a metadata
   257  // mutations has been performed.
   258  //
   259  // The callback function is an argument for whether a deletion has occurred
   260  // since the last garbage collection.
   261  func (m *DB) RegisterMutationCallback(fn func(bool)) {
   262  	m.wlock.Lock()
   263  	m.mutationCallbacks = append(m.mutationCallbacks, fn)
   264  	m.wlock.Unlock()
   265  }
   266  
// GCStats holds the duration for the different phases of the garbage collector
type GCStats struct {
	// MetaD is the time GarbageCollect spent between acquiring and
	// releasing wlock.
	MetaD time.Duration
	// ContentD is the time taken by the content store cleanup; it stays
	// zero when no content cleanup was scheduled.
	ContentD time.Duration
	// SnapshotD maps a snapshotter name to the time taken by its cleanup;
	// it stays nil when no snapshotter cleanup was scheduled.
	SnapshotD map[string]time.Duration
}
   273  
// Elapsed returns the duration which elapsed during a collection. Only
// MetaD is reported; ContentD and SnapshotD are not added to it.
func (s GCStats) Elapsed() time.Duration {
	return s.MetaD
}
   278  
   279  // GarbageCollect starts garbage collection
   280  func (m *DB) GarbageCollect(ctx context.Context) (gc.Stats, error) {
   281  	m.wlock.Lock()
   282  	t1 := time.Now()
   283  
   284  	marked, err := m.getMarked(ctx)
   285  	if err != nil {
   286  		m.wlock.Unlock()
   287  		return nil, err
   288  	}
   289  
   290  	if err := m.db.Update(func(tx *bolt.Tx) error {
   291  		ctx, cancel := context.WithCancel(ctx)
   292  		defer cancel()
   293  
   294  		rm := func(ctx context.Context, n gc.Node) error {
   295  			if _, ok := marked[n]; ok {
   296  				return nil
   297  			}
   298  
   299  			if n.Type == ResourceSnapshot {
   300  				if idx := strings.IndexRune(n.Key, '/'); idx > 0 {
   301  					m.dirtySS[n.Key[:idx]] = struct{}{}
   302  				}
   303  			} else if n.Type == ResourceContent || n.Type == ResourceIngest {
   304  				m.dirtyCS = true
   305  			}
   306  			return remove(ctx, tx, n)
   307  		}
   308  
   309  		if err := scanAll(ctx, tx, rm); err != nil {
   310  			return errors.Wrap(err, "failed to scan and remove")
   311  		}
   312  
   313  		return nil
   314  	}); err != nil {
   315  		m.wlock.Unlock()
   316  		return nil, err
   317  	}
   318  
   319  	var stats GCStats
   320  	var wg sync.WaitGroup
   321  
   322  	// reset dirty, no need for atomic inside of wlock.Lock
   323  	m.dirty = 0
   324  
   325  	if len(m.dirtySS) > 0 {
   326  		var sl sync.Mutex
   327  		stats.SnapshotD = map[string]time.Duration{}
   328  		wg.Add(len(m.dirtySS))
   329  		for snapshotterName := range m.dirtySS {
   330  			log.G(ctx).WithField("snapshotter", snapshotterName).Debug("schedule snapshotter cleanup")
   331  			go func(snapshotterName string) {
   332  				st1 := time.Now()
   333  				m.cleanupSnapshotter(snapshotterName)
   334  
   335  				sl.Lock()
   336  				stats.SnapshotD[snapshotterName] = time.Since(st1)
   337  				sl.Unlock()
   338  
   339  				wg.Done()
   340  			}(snapshotterName)
   341  		}
   342  		m.dirtySS = map[string]struct{}{}
   343  	}
   344  
   345  	if m.dirtyCS {
   346  		wg.Add(1)
   347  		log.G(ctx).Debug("schedule content cleanup")
   348  		go func() {
   349  			ct1 := time.Now()
   350  			m.cleanupContent()
   351  			stats.ContentD = time.Since(ct1)
   352  			wg.Done()
   353  		}()
   354  		m.dirtyCS = false
   355  	}
   356  
   357  	stats.MetaD = time.Since(t1)
   358  	m.wlock.Unlock()
   359  
   360  	wg.Wait()
   361  
   362  	return stats, err
   363  }
   364  
   365  func (m *DB) getMarked(ctx context.Context) (map[gc.Node]struct{}, error) {
   366  	var marked map[gc.Node]struct{}
   367  	if err := m.db.View(func(tx *bolt.Tx) error {
   368  		ctx, cancel := context.WithCancel(ctx)
   369  		defer cancel()
   370  
   371  		var (
   372  			nodes []gc.Node
   373  			wg    sync.WaitGroup
   374  			roots = make(chan gc.Node)
   375  		)
   376  		wg.Add(1)
   377  		go func() {
   378  			defer wg.Done()
   379  			for n := range roots {
   380  				nodes = append(nodes, n)
   381  			}
   382  		}()
   383  		// Call roots
   384  		if err := scanRoots(ctx, tx, roots); err != nil {
   385  			cancel()
   386  			return err
   387  		}
   388  		close(roots)
   389  		wg.Wait()
   390  
   391  		refs := func(n gc.Node) ([]gc.Node, error) {
   392  			var sn []gc.Node
   393  			if err := references(ctx, tx, n, func(nn gc.Node) {
   394  				sn = append(sn, nn)
   395  			}); err != nil {
   396  				return nil, err
   397  			}
   398  			return sn, nil
   399  		}
   400  
   401  		reachable, err := gc.Tricolor(nodes, refs)
   402  		if err != nil {
   403  			return err
   404  		}
   405  		marked = reachable
   406  		return nil
   407  	}); err != nil {
   408  		return nil, err
   409  	}
   410  	return marked, nil
   411  }
   412  
   413  func (m *DB) cleanupSnapshotter(name string) (time.Duration, error) {
   414  	ctx := context.Background()
   415  	sn, ok := m.ss[name]
   416  	if !ok {
   417  		return 0, nil
   418  	}
   419  
   420  	d, err := sn.garbageCollect(ctx)
   421  	logger := log.G(ctx).WithField("snapshotter", name)
   422  	if err != nil {
   423  		logger.WithError(err).Warn("snapshot garbage collection failed")
   424  	} else {
   425  		logger.WithField("d", d).Debugf("snapshot garbage collected")
   426  	}
   427  	return d, err
   428  }
   429  
   430  func (m *DB) cleanupContent() (time.Duration, error) {
   431  	ctx := context.Background()
   432  	if m.cs == nil {
   433  		return 0, nil
   434  	}
   435  
   436  	d, err := m.cs.garbageCollect(ctx)
   437  	if err != nil {
   438  		log.G(ctx).WithError(err).Warn("content garbage collection failed")
   439  	} else {
   440  		log.G(ctx).WithField("d", d).Debugf("content garbage collected")
   441  	}
   442  
   443  	return d, err
   444  }