// Source: github.com/wtsi-ssg/wrstat/v4@v4.5.1/dgut/db.go

/*******************************************************************************
 * Copyright (c) 2022 Genome Research Ltd.
 *
 * Author: Sendu Bala <sb10@sanger.ac.uk>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 ******************************************************************************/ 25 26 package dgut 27 28 import ( 29 "io" 30 "os" 31 "path/filepath" 32 "sort" 33 "syscall" 34 35 "github.com/hashicorp/go-multierror" 36 "github.com/ugorji/go/codec" 37 "github.com/wtsi-ssg/wrstat/v4/summary" 38 bolt "go.etcd.io/bbolt" 39 ) 40 41 const ( 42 gutBucket = "gut" 43 childBucket = "children" 44 dbBasenameDGUT = "dgut.db" 45 dbBasenameChildren = dbBasenameDGUT + ".children" 46 dbOpenMode = 0600 47 ) 48 49 const ErrDBExists = Error("database already exists") 50 const ErrDBNotExists = Error("database doesn't exist") 51 const ErrDirNotFound = Error("directory not found") 52 53 // a dbSet is 2 databases, one for storing DGUTs, one for storing children. 54 type dbSet struct { 55 dir string 56 dguts *bolt.DB 57 children *bolt.DB 58 } 59 60 // newDBSet creates a new newDBSet that knows where its database files are 61 // located or should be created. 62 func newDBSet(dir string) *dbSet { 63 return &dbSet{ 64 dir: dir, 65 } 66 } 67 68 // Create creates new database files in our directory. Returns an error if those 69 // files already exist. 70 func (s *dbSet) Create() error { 71 paths := s.paths() 72 73 if s.pathsExist(paths) { 74 return ErrDBExists 75 } 76 77 db, err := openBoltWritable(paths[0], gutBucket) 78 if err != nil { 79 return err 80 } 81 82 s.dguts = db 83 84 db, err = openBoltWritable(paths[1], childBucket) 85 s.children = db 86 87 return err 88 } 89 90 // paths returns the expected paths for our dgut and children databases 91 // respectively. 92 func (s *dbSet) paths() []string { 93 return []string{ 94 filepath.Join(s.dir, dbBasenameDGUT), 95 filepath.Join(s.dir, dbBasenameChildren), 96 } 97 } 98 99 // pathsExist tells you if the databases at the given paths already exist. 
100 func (s *dbSet) pathsExist(paths []string) bool { 101 for _, path := range paths { 102 info, err := os.Stat(path) 103 if err == nil && info.Size() != 0 { 104 return true 105 } 106 } 107 108 return false 109 } 110 111 // openBoltWritable creates a new database at the given path with the given 112 // bucket inside. 113 func openBoltWritable(path, bucket string) (*bolt.DB, error) { 114 db, err := bolt.Open(path, dbOpenMode, &bolt.Options{ 115 NoFreelistSync: true, 116 NoGrowSync: true, 117 FreelistType: bolt.FreelistMapType, 118 }) 119 if err != nil { 120 return nil, err 121 } 122 123 err = db.Update(func(tx *bolt.Tx) error { 124 _, errc := tx.CreateBucketIfNotExists([]byte(bucket)) 125 126 return errc 127 }) 128 129 return db, err 130 } 131 132 // Open opens our constituent databases read-only. 133 func (s *dbSet) Open() error { 134 paths := s.paths() 135 136 db, err := openBoltReadOnly(paths[0]) 137 if err != nil { 138 return err 139 } 140 141 s.dguts = db 142 143 db, err = openBoltReadOnly(paths[1]) 144 if err != nil { 145 return err 146 } 147 148 s.children = db 149 150 return nil 151 } 152 153 // openBoltReadOnly opens a bolt database at the given path in read-only mode. 154 func openBoltReadOnly(path string) (*bolt.DB, error) { 155 return bolt.Open(path, dbOpenMode, &bolt.Options{ 156 ReadOnly: true, 157 MmapFlags: syscall.MAP_POPULATE, 158 }) 159 } 160 161 // Close closes our constituent databases. 162 func (s *dbSet) Close() error { 163 var errm *multierror.Error 164 165 err := s.dguts.Close() 166 errm = multierror.Append(errm, err) 167 168 err = s.children.Close() 169 errm = multierror.Append(errm, err) 170 171 return errm.ErrorOrNil() 172 } 173 174 // DB is used to create and query a database made from a dgut file, which is the 175 // directory,group,user,type summary output produced by the summary packages' 176 // DirGroupUserType.Output() method. 
177 type DB struct { 178 paths []string 179 writeSet *dbSet 180 readSets []*dbSet 181 batchSize int 182 writeBatch []*DGUT 183 writeI int 184 writeErr error 185 ch codec.Handle 186 } 187 188 // NewDB returns a *DB that can be used to create or query a dgut database. 189 // Provide the path to directory that (will) store(s) the database files. In the 190 // case of only reading databases with Open(), you can supply multiple directory 191 // paths to query all of them simultaneously. 192 func NewDB(paths ...string) *DB { 193 return &DB{paths: paths} 194 } 195 196 // Store will read the given dgut file data (as output by 197 // summary.DirGroupUserType.Output()) and store it in 2 database files that 198 // offer fast lookup of the information by directory. 199 // 200 // The path for the database directory you provided to NewDB() (only the first 201 // will be used) must not already have database files in it to create a new 202 // database. You can't add to an existing database. If you create multiple sets 203 // of data to store, instead Store them to individual database directories, and 204 // then load all them together during Open(). 205 // 206 // batchSize is how many directories worth of information are written to the 207 // database in one go. More is faster, but uses more memory. 10,000 might be a 208 // good number to try. 209 func (d *DB) Store(data io.Reader, batchSize int) error { 210 d.batchSize = batchSize 211 212 err := d.createDB() 213 if err != nil { 214 return err 215 } 216 217 defer func() { 218 errc := d.writeSet.Close() 219 if err == nil { 220 err = errc 221 } 222 }() 223 224 if err = d.storeData(data); err != nil { 225 return err 226 } 227 228 if d.writeBatch[0] != nil { 229 d.storeBatch() 230 } 231 232 err = d.writeErr 233 234 return err 235 } 236 237 // createDB creates a new database set, but only if it doesn't already exist. 
238 func (d *DB) createDB() error { 239 set := newDBSet(d.paths[0]) 240 241 err := set.Create() 242 if err != nil { 243 return err 244 } 245 246 d.writeSet = set 247 d.ch = new(codec.BincHandle) 248 249 return err 250 } 251 252 // storeData parses the data and stores it in our database file. Only call this 253 // after calling createDB(), and only call it once. 254 func (d *DB) storeData(data io.Reader) error { 255 d.resetBatch() 256 257 return parseDGUTLines(data, d.parserCB) 258 } 259 260 // resetBatch prepares us to receive a new batch of DGUTs from the parser. 261 func (d *DB) resetBatch() { 262 d.writeBatch = make([]*DGUT, d.batchSize) 263 d.writeI = 0 264 } 265 266 // parserCB is a dgutParserCallBack that is called during parsing of dgut file 267 // data. It batches up the DGUTs we receive, and writes them to the database 268 // when a batch is full. 269 func (d *DB) parserCB(dgut *DGUT) { 270 d.writeBatch[d.writeI] = dgut 271 d.writeI++ 272 273 if d.writeI == d.batchSize { 274 d.storeBatch() 275 d.resetBatch() 276 } 277 } 278 279 // storeBatch writes the current batch of DGUTs to the database. It also updates 280 // our dir->child lookup in the database. 281 func (d *DB) storeBatch() { 282 if d.writeErr != nil { 283 return 284 } 285 286 var errm *multierror.Error 287 288 err := d.writeSet.children.Update(d.storeChildren) 289 errm = multierror.Append(errm, err) 290 291 err = d.writeSet.dguts.Update(d.storeDGUTs) 292 errm = multierror.Append(errm, err) 293 294 err = errm.ErrorOrNil() 295 if err != nil { 296 d.writeErr = err 297 } 298 } 299 300 // storeChildren stores the Dirs of the current DGUT batch in the db. 
301 func (d *DB) storeChildren(txn *bolt.Tx) error { 302 b := txn.Bucket([]byte(childBucket)) 303 304 parentToChildren := d.calculateChildrenOfParents(b) 305 306 for parent, children := range parentToChildren { 307 if err := b.Put([]byte(parent), d.encodeChildren(children)); err != nil { 308 return err 309 } 310 } 311 312 return nil 313 } 314 315 // calculateChildrenOfParents works out what the children of every parent 316 // directory of every dgut.Dir is in the current writeBatch. Returns a map 317 // of parent keys and children slice value. 318 func (d *DB) calculateChildrenOfParents(b *bolt.Bucket) map[string][]string { 319 parentToChildren := make(map[string][]string) 320 321 for _, dgut := range d.writeBatch { 322 if dgut == nil { 323 continue 324 } 325 326 d.storeChildrenOfParentInMap(b, dgut.Dir, parentToChildren) 327 } 328 329 return parentToChildren 330 } 331 332 // storeChildrenOfParentInMap gets current children of child's parent in the db 333 // and stores them in the store map, then once stored in the map, appends this 334 // child to the parent's children. 335 func (d *DB) storeChildrenOfParentInMap(b *bolt.Bucket, child string, store map[string][]string) { 336 if child == "/" { 337 return 338 } 339 340 parent := filepath.Dir(child) 341 342 var children []string 343 344 if storedChildren, stored := store[parent]; stored { 345 children = storedChildren 346 } else { 347 children = d.getChildrenFromDB(b, parent) 348 } 349 350 children = append(children, child) 351 352 store[parent] = children 353 } 354 355 // getChildrenFromDB retrieves the child directory values associated with the 356 // given directory key in the given db. Returns an empty slice if the dir wasn't 357 // found. 
358 func (d *DB) getChildrenFromDB(b *bolt.Bucket, dir string) []string { 359 v := b.Get([]byte(dir)) 360 if v == nil { 361 return []string{} 362 } 363 364 return d.decodeChildrenBytes(v) 365 } 366 367 // decodeChildBytes converts the byte slice returned by encodeChildren() back 368 // in to a []string. 369 func (d *DB) decodeChildrenBytes(encoded []byte) []string { 370 dec := codec.NewDecoderBytes(encoded, d.ch) 371 372 var children []string 373 374 dec.MustDecode(&children) 375 376 return children 377 } 378 379 // encodeChildren returns converts the given string slice into a []byte suitable 380 // for storing on disk. 381 func (d *DB) encodeChildren(dirs []string) []byte { 382 var encoded []byte 383 enc := codec.NewEncoderBytes(&encoded, d.ch) 384 enc.MustEncode(dirs) 385 386 return encoded 387 } 388 389 // storeDGUTs stores the current batch of DGUTs in the db. 390 func (d *DB) storeDGUTs(tx *bolt.Tx) error { 391 b := tx.Bucket([]byte(gutBucket)) 392 393 for _, dgut := range d.writeBatch { 394 if dgut == nil { 395 return nil 396 } 397 398 if err := d.storeDGUT(b, dgut); err != nil { 399 return err 400 } 401 } 402 403 return nil 404 } 405 406 // storeDGUT stores a DGUT in the db. DGUTs are expected to be unique per 407 // Store() operation and database. 408 func (d *DB) storeDGUT(b *bolt.Bucket, dgut *DGUT) error { 409 dir, guts := dgut.encodeToBytes(d.ch) 410 411 return b.Put(dir, guts) 412 } 413 414 // Open opens the database(s) for reading. You need to call this before using 415 // the query methods like DirInfo() and Which(). You should call Close() after 416 // you've finished. 
417 func (d *DB) Open() error { 418 readSets := make([]*dbSet, len(d.paths)) 419 420 for i, path := range d.paths { 421 readSet := newDBSet(path) 422 423 if !readSet.pathsExist(readSet.paths()) { 424 return ErrDBNotExists 425 } 426 427 err := readSet.Open() 428 if err != nil { 429 return err 430 } 431 432 readSets[i] = readSet 433 } 434 435 d.readSets = readSets 436 437 d.ch = new(codec.BincHandle) 438 439 return nil 440 } 441 442 // Close closes the database(s) after reading. You should call this once 443 // you've finished reading, but it's not necessary; errors are ignored. 444 func (d *DB) Close() { 445 if d.readSets == nil { 446 return 447 } 448 449 for _, readSet := range d.readSets { 450 readSet.Close() 451 } 452 } 453 454 // DirInfo tells you the total number of files, their total size, oldest atime 455 // and newset mtime nested under the given directory, along with the UIDs, GIDs 456 // and FTs of those files. See GUTs.Summary for an explanation of the filter. 457 // 458 // Returns an error if dir doesn't exist. 459 // 460 // You must call Open() before calling this. 461 func (d *DB) DirInfo(dir string, filter *Filter) (uint64, uint64, int64, int64, 462 []uint32, []uint32, []summary.DirGUTFileType, error) { 463 var notFound int 464 465 dgut := &DGUT{} 466 467 for _, readSet := range d.readSets { 468 if err := readSet.dguts.View(func(tx *bolt.Tx) error { 469 b := tx.Bucket([]byte(gutBucket)) 470 471 return getDGUTFromDBAndAppend(b, dir, d.ch, dgut) 472 }); err != nil { 473 notFound++ 474 } 475 } 476 477 if notFound == len(d.readSets) { 478 return 0, 0, 0, 0, nil, nil, nil, ErrDirNotFound 479 } 480 481 c, s, a, m, u, g, t := dgut.Summary(filter) 482 483 return c, s, a, m, u, g, t, nil 484 } 485 486 // getDGUTFromDBAndAppend calls getDGUTFromDB() and appends the result 487 // to the given dgut. If the given dgut is empty, it will be populated with the 488 // content of the result instead. 
489 func getDGUTFromDBAndAppend(b *bolt.Bucket, dir string, ch codec.Handle, dgut *DGUT) error { 490 thisDGUT, err := getDGUTFromDB(b, dir, ch) 491 if err != nil { 492 return err 493 } 494 495 if dgut.Dir == "" { 496 dgut.Dir = thisDGUT.Dir 497 dgut.GUTs = thisDGUT.GUTs 498 } else { 499 dgut.Append(thisDGUT) 500 } 501 502 return nil 503 } 504 505 // getDGUTFromDB gets and decodes a dgut from the given database. 506 func getDGUTFromDB(b *bolt.Bucket, dir string, ch codec.Handle) (*DGUT, error) { 507 bdir := []byte(dir) 508 509 v := b.Get(bdir) 510 if v == nil { 511 return nil, ErrDirNotFound 512 } 513 514 dgut := decodeDGUTbytes(ch, bdir, v) 515 516 return dgut, nil 517 } 518 519 // Children returns the directory paths that are directly inside the given 520 // directory. 521 // 522 // Returns an empty slice if dir had no children (because it was a leaf dir, 523 // or didn't exist at all). 524 // 525 // The same children from multiple databases are de-duplicated. 526 // 527 // You must call Open() before calling this. 528 func (d *DB) Children(dir string) []string { 529 children := make(map[string]bool) 530 531 for _, readSet := range d.readSets { 532 // no error is possible here, but the View function requires we return 533 // one. 534 //nolint:errcheck 535 readSet.children.View(func(tx *bolt.Tx) error { 536 b := tx.Bucket([]byte(childBucket)) 537 538 for _, child := range d.getChildrenFromDB(b, dir) { 539 children[child] = true 540 } 541 542 return nil 543 }) 544 } 545 546 return mapToSortedKeys(children) 547 } 548 549 // mapToSortedKeys takes the keys from the given map and returns them as a 550 // sorted slice. If map length is 0, returns nil. 551 func mapToSortedKeys(things map[string]bool) []string { 552 if len(things) == 0 { 553 return nil 554 } 555 556 keys := make([]string, len(things)) 557 i := 0 558 559 for thing := range things { 560 keys[i] = thing 561 i++ 562 } 563 564 sort.Strings(keys) 565 566 return keys 567 }