github.com/cilium/statedb@v0.3.2/db.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package statedb 5 6 import ( 7 "context" 8 "net/http" 9 "runtime" 10 "slices" 11 "sort" 12 "sync" 13 "sync/atomic" 14 "time" 15 16 "github.com/cilium/statedb/internal" 17 ) 18 19 // DB provides an in-memory transaction database built on top of immutable radix 20 // trees. The database supports multiple tables, each with one or more user-defined 21 // indexes. Readers can access the data locklessly with a simple atomic pointer read 22 // to obtain a snapshot. On writes to the database table-level locks are acquired 23 // on target tables and on write transaction commit a root lock is taken to swap 24 // in the new root with the modified tables. 25 // 26 // As data is stored in immutable data structures any objects inserted into 27 // it MUST NOT be mutated afterwards. 28 // 29 // DB holds the "root" tree of tables with each table holding a tree of indexes: 30 // 31 // root 32 // / \ 33 // ba T(foo) 34 // / \ 35 // T(bar) T(baz) 36 // 37 // T(bar).indexes 38 // / \ 39 // i I(byRevision) 40 // / \ 41 // I(id) I(ip) 42 // 43 // I(ip) 44 // / \ 45 // 192 172 46 // / ... 47 // bar(192.168.1.1) 48 // 49 // T = tableEntry 50 // I = indexTree 51 // 52 // To lookup: 53 // 1. Create a read (or write) transaction 54 // 2. Find the table from the root tree 55 // 3. Find the index from the table's index tree 56 // 4. Find the object from the index 57 // 58 // To insert: 59 // 1. Create write transaction against the target table 60 // 2. Find the table from the root tree 61 // 3. Create/reuse write transaction on primary index 62 // 4. Insert/replace the object into primary index 63 // 5. Create/reuse write transaction on revision index 64 // 6. If old object existed, remove from revision index 65 // 7. If old object existed, remove from graveyard 66 // 8. Update each secondary index 67 // 9. Commit transaction by committing each index to 68 // the table and then committing table to the root. 69 // Swap the root atomic pointer to new root and 70 // notify by closing channels of all modified nodes. 71 // 72 // To observe deletions: 73 // 1. Create write transaction against the target table 74 // 2. Create new delete tracker and add it to the table 75 // 3. Commit the write transaction to update the table 76 // with the new delete tracker 77 // 4. Query the graveyard by revision, starting from the 78 // revision of the write transaction at which it was 79 // created. 80 // 5. For each successfully processed deletion, mark the 81 // revision to set low watermark for garbage collection. 82 // 6. Periodically garbage collect the graveyard by finding 83 // the lowest revision of all delete trackers. 84 type DB struct { 85 handleName string 86 *dbState 87 } 88 89 // dbState is the underlying state of the database shared by all [DB] handles. 90 type dbState struct { 91 mu sync.Mutex // protects 'tables' and sequences modifications to the root tree 92 ctx context.Context 93 cancel context.CancelFunc 94 root atomic.Pointer[dbRoot] 95 gcTrigger chan struct{} // trigger for graveyard garbage collection 96 gcExited chan struct{} 97 gcRateLimitInterval time.Duration 98 metrics Metrics 99 } 100 101 type dbRoot []tableEntry 102 103 type Option func(*opts) 104 105 type opts struct { 106 metrics Metrics 107 } 108 109 func WithMetrics(m Metrics) Option { 110 return func(o *opts) { 111 o.metrics = m 112 } 113 } 114 115 // New creates a new database. 116 // 117 // The created database must be started and stopped! 118 func New(options ...Option) *DB { 119 var opts opts 120 for _, o := range options { 121 o(&opts) 122 } 123 if opts.metrics == nil { 124 // Use the default metrics implementation but don't publish it. 125 opts.metrics = NewExpVarMetrics(false) 126 } 127 128 db := &DB{ 129 dbState: &dbState{ 130 metrics: opts.metrics, 131 gcRateLimitInterval: defaultGCRateLimitInterval, 132 }, 133 } 134 db.handleName = "DB" 135 root := dbRoot{} 136 db.root.Store(&root) 137 return db 138 } 139 140 // RegisterTable registers a table to the database: 141 // 142 // func NewMyTable() statedb.RWTable[MyTable] { ... } 143 // cell.Provide(NewMyTable), 144 // cell.Invoke(statedb.RegisterTable[MyTable]), 145 func RegisterTable[Obj any](db *DB, table RWTable[Obj]) error { 146 return db.RegisterTable(table) 147 } 148 149 // RegisterTable registers a table to the database. 150 func (db *DB) RegisterTable(table TableMeta, tables ...TableMeta) error { 151 db.mu.Lock() 152 defer db.mu.Unlock() 153 154 root := slices.Clone(*db.root.Load()) 155 156 if err := db.registerTable(table, &root); err != nil { 157 return err 158 } 159 for _, t := range tables { 160 if err := db.registerTable(t, &root); err != nil { 161 return err 162 } 163 } 164 db.root.Store(&root) 165 return nil 166 } 167 168 func (db *DB) registerTable(table TableMeta, root *dbRoot) error { 169 name := table.Name() 170 for _, t := range *root { 171 if t.meta.Name() == name { 172 return tableError(name, ErrDuplicateTable) 173 } 174 } 175 176 pos := len(*root) 177 table.setTablePos(pos) 178 *root = append(*root, table.tableEntry()) 179 return nil 180 } 181 182 // ReadTxn constructs a new read transaction for performing reads against 183 // a snapshot of the database. 184 // 185 // The returned ReadTxn is not thread-safe. 186 func (db *DB) ReadTxn() ReadTxn { 187 return &txn{ 188 db: db, 189 root: *db.root.Load(), 190 } 191 } 192 193 // WriteTxn constructs a new write transaction against the given set of tables. 194 // Each table is locked, which may block until the table locks are acquired. 195 // The modifications performed in the write transaction are not visible outside 196 // it until Commit() is called. To discard the changes call Abort(). 197 // 198 // The returned WriteTxn is not thread-safe. 199 func (db *DB) WriteTxn(table TableMeta, tables ...TableMeta) WriteTxn { 200 allTables := append(tables, table) 201 smus := internal.SortableMutexes{} 202 for _, table := range allTables { 203 smus = append(smus, table.sortableMutex()) 204 if table.tablePos() < 0 { 205 panic(tableError(table.Name(), ErrTableNotRegistered)) 206 } 207 } 208 lockAt := time.Now() 209 smus.Lock() 210 acquiredAt := time.Now() 211 root := *db.root.Load() 212 tableEntries := make([]*tableEntry, len(root)) 213 214 txn := &txn{ 215 db: db, 216 root: root, 217 handle: db.handleName, 218 acquiredAt: time.Now(), 219 writeTxn: writeTxn{ 220 modifiedTables: tableEntries, 221 smus: smus, 222 }, 223 } 224 225 var tableNames []string 226 for _, table := range allTables { 227 tableEntry := root[table.tablePos()] 228 tableEntry.indexes = slices.Clone(tableEntry.indexes) 229 tableEntries[table.tablePos()] = &tableEntry 230 tableNames = append(tableNames, table.Name()) 231 232 db.metrics.WriteTxnTableAcquisition( 233 db.handleName, 234 table.Name(), 235 table.sortableMutex().AcquireDuration(), 236 ) 237 table.acquired(txn) 238 } 239 240 // Sort the table names so they always appear ordered in metrics. 241 sort.Strings(tableNames) 242 txn.tableNames = tableNames 243 244 db.metrics.WriteTxnTotalAcquisition( 245 db.handleName, 246 tableNames, 247 acquiredAt.Sub(lockAt), 248 ) 249 250 runtime.SetFinalizer(txn, txnFinalizer) 251 return txn 252 } 253 254 func (db *DB) GetTables(txn ReadTxn) (tbls []TableMeta) { 255 root := txn.getTxn().root 256 tbls = make([]TableMeta, 0, len(root)) 257 for _, table := range root { 258 tbls = append(tbls, table.meta) 259 } 260 return 261 } 262 263 func (db *DB) GetTable(txn ReadTxn, name string) TableMeta { 264 root := txn.getTxn().root 265 for _, table := range root { 266 if table.meta.Name() == name { 267 return table.meta 268 } 269 } 270 return nil 271 } 272 273 // Start the background workers for the database. 274 // 275 // This starts the graveyard worker that deals with garbage collecting 276 // deleted objects that are no longer necessary for Changes(). 277 func (db *DB) Start() error { 278 db.gcTrigger = make(chan struct{}, 1) 279 db.gcExited = make(chan struct{}) 280 db.ctx, db.cancel = context.WithCancel(context.Background()) 281 go graveyardWorker(db, db.ctx, db.gcRateLimitInterval) 282 return nil 283 } 284 285 // Stop the background workers. 286 func (db *DB) Stop() error { 287 db.cancel() 288 <-db.gcExited 289 return nil 290 } 291 292 // ServeHTTP is an HTTP handler for dumping StateDB as JSON. 293 // 294 // Example usage: 295 // 296 // var db *statedb.DB 297 // 298 // http.Handle("/db", db) 299 // http.ListenAndServe(":8080", nil) 300 func (db *DB) ServeHTTP(w http.ResponseWriter, r *http.Request) { 301 w.Header().Add("Content-Type", "application/json") 302 w.WriteHeader(http.StatusOK) 303 db.ReadTxn().WriteJSON(w) 304 } 305 306 // setGCRateLimitInterval can set the graveyard GC interval before DB is started. 307 // Used by tests. 308 func (db *DB) setGCRateLimitInterval(interval time.Duration) { 309 db.gcRateLimitInterval = interval 310 } 311 312 // NewHandle returns a new named handle to the DB. The given name is used to annotate 313 // metrics. 314 func (db *DB) NewHandle(name string) *DB { 315 return &DB{ 316 handleName: name, 317 dbState: db.dbState, 318 } 319 }