github.com/cilium/statedb@v0.3.2/db.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package statedb
     5  
     6  import (
     7  	"context"
     8  	"net/http"
     9  	"runtime"
    10  	"slices"
    11  	"sort"
    12  	"sync"
    13  	"sync/atomic"
    14  	"time"
    15  
    16  	"github.com/cilium/statedb/internal"
    17  )
    18  
    19  // DB provides an in-memory transaction database built on top of immutable radix
    20  // trees. The database supports multiple tables, each with one or more user-defined
    21  // indexes. Readers can access the data locklessly with a simple atomic pointer read
    22  // to obtain a snapshot. On writes to the database table-level locks are acquired
    23  // on target tables and on write transaction commit a root lock is taken to swap
    24  // in the new root with the modified tables.
    25  //
    26  // As data is stored in immutable data structures any objects inserted into
    27  // it MUST NOT be mutated afterwards.
    28  //
    29  // DB holds the "root" tree of tables with each table holding a tree of indexes:
    30  //
    31  //	           root
    32  //	          /    \
    33  //	         ba    T(foo)
    34  //	       /   \
    35  //	   T(bar)  T(baz)
    36  //
    37  //	      T(bar).indexes
    38  //		   /  \
    39  //		  i    I(byRevision)
    40  //		/   \
    41  //	   I(id)    I(ip)
    42  //
    43  //	          I(ip)
    44  //	          /  \
    45  //	        192  172
    46  //	        /     ...
    47  //	    bar(192.168.1.1)
    48  //
    49  // T = tableEntry
    50  // I = indexTree
    51  //
    52  // To lookup:
    53  //  1. Create a read (or write) transaction
    54  //  2. Find the table from the root tree
    55  //  3. Find the index from the table's index tree
    56  //  4. Find the object from the index
    57  //
    58  // To insert:
    59  //  1. Create write transaction against the target table
    60  //  2. Find the table from the root tree
    61  //  3. Create/reuse write transaction on primary index
    62  //  4. Insert/replace the object into primary index
    63  //  5. Create/reuse write transaction on revision index
    64  //  6. If old object existed, remove from revision index
    65  //  7. If old object existed, remove from graveyard
    66  //  8. Update each secondary index
    67  //  9. Commit transaction by committing each index to
    68  //     the table and then committing table to the root.
    69  //     Swap the root atomic pointer to new root and
    70  //     notify by closing channels of all modified nodes.
    71  //
    72  // To observe deletions:
    73  //  1. Create write transaction against the target table
    74  //  2. Create new delete tracker and add it to the table
    75  //  3. Commit the write transaction to update the table
    76  //     with the new delete tracker
    77  //  4. Query the graveyard by revision, starting from the
    78  //     revision of the write transaction at which it was
    79  //     created.
    80  //  5. For each successfully processed deletion, mark the
    81  //     revision to set low watermark for garbage collection.
    82  //  6. Periodically garbage collect the graveyard by finding
    83  //     the lowest revision of all delete trackers.
    84  type DB struct {
    85  	handleName string
    86  	*dbState
    87  }
    88  
    89  // dbState is the underlying state of the database shared by all [DB] handles.
    90  type dbState struct {
    91  	mu                  sync.Mutex // protects 'tables' and sequences modifications to the root tree
    92  	ctx                 context.Context
    93  	cancel              context.CancelFunc
    94  	root                atomic.Pointer[dbRoot]
    95  	gcTrigger           chan struct{} // trigger for graveyard garbage collection
    96  	gcExited            chan struct{}
    97  	gcRateLimitInterval time.Duration
    98  	metrics             Metrics
    99  }
   100  
   101  type dbRoot []tableEntry
   102  
   103  type Option func(*opts)
   104  
   105  type opts struct {
   106  	metrics Metrics
   107  }
   108  
   109  func WithMetrics(m Metrics) Option {
   110  	return func(o *opts) {
   111  		o.metrics = m
   112  	}
   113  }
   114  
   115  // New creates a new database.
   116  //
   117  // The created database must be started and stopped!
   118  func New(options ...Option) *DB {
   119  	var opts opts
   120  	for _, o := range options {
   121  		o(&opts)
   122  	}
   123  	if opts.metrics == nil {
   124  		// Use the default metrics implementation but don't publish it.
   125  		opts.metrics = NewExpVarMetrics(false)
   126  	}
   127  
   128  	db := &DB{
   129  		dbState: &dbState{
   130  			metrics:             opts.metrics,
   131  			gcRateLimitInterval: defaultGCRateLimitInterval,
   132  		},
   133  	}
   134  	db.handleName = "DB"
   135  	root := dbRoot{}
   136  	db.root.Store(&root)
   137  	return db
   138  }
   139  
   140  // RegisterTable registers a table to the database:
   141  //
   142  //	func NewMyTable() statedb.RWTable[MyTable] { ... }
   143  //	cell.Provide(NewMyTable),
   144  //	cell.Invoke(statedb.RegisterTable[MyTable]),
   145  func RegisterTable[Obj any](db *DB, table RWTable[Obj]) error {
   146  	return db.RegisterTable(table)
   147  }
   148  
   149  // RegisterTable registers a table to the database.
   150  func (db *DB) RegisterTable(table TableMeta, tables ...TableMeta) error {
   151  	db.mu.Lock()
   152  	defer db.mu.Unlock()
   153  
   154  	root := slices.Clone(*db.root.Load())
   155  
   156  	if err := db.registerTable(table, &root); err != nil {
   157  		return err
   158  	}
   159  	for _, t := range tables {
   160  		if err := db.registerTable(t, &root); err != nil {
   161  			return err
   162  		}
   163  	}
   164  	db.root.Store(&root)
   165  	return nil
   166  }
   167  
   168  func (db *DB) registerTable(table TableMeta, root *dbRoot) error {
   169  	name := table.Name()
   170  	for _, t := range *root {
   171  		if t.meta.Name() == name {
   172  			return tableError(name, ErrDuplicateTable)
   173  		}
   174  	}
   175  
   176  	pos := len(*root)
   177  	table.setTablePos(pos)
   178  	*root = append(*root, table.tableEntry())
   179  	return nil
   180  }
   181  
   182  // ReadTxn constructs a new read transaction for performing reads against
   183  // a snapshot of the database.
   184  //
   185  // The returned ReadTxn is not thread-safe.
   186  func (db *DB) ReadTxn() ReadTxn {
   187  	return &txn{
   188  		db:   db,
   189  		root: *db.root.Load(),
   190  	}
   191  }
   192  
   193  // WriteTxn constructs a new write transaction against the given set of tables.
   194  // Each table is locked, which may block until the table locks are acquired.
   195  // The modifications performed in the write transaction are not visible outside
   196  // it until Commit() is called. To discard the changes call Abort().
   197  //
   198  // The returned WriteTxn is not thread-safe.
   199  func (db *DB) WriteTxn(table TableMeta, tables ...TableMeta) WriteTxn {
   200  	allTables := append(tables, table)
   201  	smus := internal.SortableMutexes{}
   202  	for _, table := range allTables {
   203  		smus = append(smus, table.sortableMutex())
   204  		if table.tablePos() < 0 {
   205  			panic(tableError(table.Name(), ErrTableNotRegistered))
   206  		}
   207  	}
   208  	lockAt := time.Now()
   209  	smus.Lock()
   210  	acquiredAt := time.Now()
   211  	root := *db.root.Load()
   212  	tableEntries := make([]*tableEntry, len(root))
   213  
   214  	txn := &txn{
   215  		db:         db,
   216  		root:       root,
   217  		handle:     db.handleName,
   218  		acquiredAt: time.Now(),
   219  		writeTxn: writeTxn{
   220  			modifiedTables: tableEntries,
   221  			smus:           smus,
   222  		},
   223  	}
   224  
   225  	var tableNames []string
   226  	for _, table := range allTables {
   227  		tableEntry := root[table.tablePos()]
   228  		tableEntry.indexes = slices.Clone(tableEntry.indexes)
   229  		tableEntries[table.tablePos()] = &tableEntry
   230  		tableNames = append(tableNames, table.Name())
   231  
   232  		db.metrics.WriteTxnTableAcquisition(
   233  			db.handleName,
   234  			table.Name(),
   235  			table.sortableMutex().AcquireDuration(),
   236  		)
   237  		table.acquired(txn)
   238  	}
   239  
   240  	// Sort the table names so they always appear ordered in metrics.
   241  	sort.Strings(tableNames)
   242  	txn.tableNames = tableNames
   243  
   244  	db.metrics.WriteTxnTotalAcquisition(
   245  		db.handleName,
   246  		tableNames,
   247  		acquiredAt.Sub(lockAt),
   248  	)
   249  
   250  	runtime.SetFinalizer(txn, txnFinalizer)
   251  	return txn
   252  }
   253  
   254  func (db *DB) GetTables(txn ReadTxn) (tbls []TableMeta) {
   255  	root := txn.getTxn().root
   256  	tbls = make([]TableMeta, 0, len(root))
   257  	for _, table := range root {
   258  		tbls = append(tbls, table.meta)
   259  	}
   260  	return
   261  }
   262  
   263  func (db *DB) GetTable(txn ReadTxn, name string) TableMeta {
   264  	root := txn.getTxn().root
   265  	for _, table := range root {
   266  		if table.meta.Name() == name {
   267  			return table.meta
   268  		}
   269  	}
   270  	return nil
   271  }
   272  
   273  // Start the background workers for the database.
   274  //
   275  // This starts the graveyard worker that deals with garbage collecting
   276  // deleted objects that are no longer necessary for Changes().
   277  func (db *DB) Start() error {
   278  	db.gcTrigger = make(chan struct{}, 1)
   279  	db.gcExited = make(chan struct{})
   280  	db.ctx, db.cancel = context.WithCancel(context.Background())
   281  	go graveyardWorker(db, db.ctx, db.gcRateLimitInterval)
   282  	return nil
   283  }
   284  
   285  // Stop the background workers.
   286  func (db *DB) Stop() error {
   287  	db.cancel()
   288  	<-db.gcExited
   289  	return nil
   290  }
   291  
   292  // ServeHTTP is an HTTP handler for dumping StateDB as JSON.
   293  //
   294  // Example usage:
   295  //
   296  //	var db *statedb.DB
   297  //
   298  //	http.Handle("/db", db)
   299  //	http.ListenAndServe(":8080", nil)
   300  func (db *DB) ServeHTTP(w http.ResponseWriter, r *http.Request) {
   301  	w.Header().Add("Content-Type", "application/json")
   302  	w.WriteHeader(http.StatusOK)
   303  	db.ReadTxn().WriteJSON(w)
   304  }
   305  
   306  // setGCRateLimitInterval can set the graveyard GC interval before DB is started.
   307  // Used by tests.
   308  func (db *DB) setGCRateLimitInterval(interval time.Duration) {
   309  	db.gcRateLimitInterval = interval
   310  }
   311  
   312  // NewHandle returns a new named handle to the DB. The given name is used to annotate
   313  // metrics.
   314  func (db *DB) NewHandle(name string) *DB {
   315  	return &DB{
   316  		handleName: name,
   317  		dbState:    db.dbState,
   318  	}
   319  }