github.com/wtsi-ssg/wrstat/v4@v4.5.1/dgut/db.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2022 Genome Research Ltd.
     3   *
     4   * Author: Sendu Bala <sb10@sanger.ac.uk>
     5   *
     6   * Permission is hereby granted, free of charge, to any person obtaining
     7   * a copy of this software and associated documentation files (the
     8   * "Software"), to deal in the Software without restriction, including
     9   * without limitation the rights to use, copy, modify, merge, publish,
    10   * distribute, sublicense, and/or sell copies of the Software, and to
    11   * permit persons to whom the Software is furnished to do so, subject to
    12   * the following conditions:
    13   *
    14   * The above copyright notice and this permission notice shall be included
    15   * in all copies or substantial portions of the Software.
    16   *
    17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    18   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    19   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    20   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    21   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    22   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    23   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24   ******************************************************************************/
    25  
    26  package dgut
    27  
    28  import (
    29  	"io"
    30  	"os"
    31  	"path/filepath"
    32  	"sort"
    33  	"syscall"
    34  
    35  	"github.com/hashicorp/go-multierror"
    36  	"github.com/ugorji/go/codec"
    37  	"github.com/wtsi-ssg/wrstat/v4/summary"
    38  	bolt "go.etcd.io/bbolt"
    39  )
    40  
// Bucket names, database file basenames and the file mode used for the bolt
// databases.
//
// gutBucket and childBucket name the single bucket created inside the dgut and
// children databases respectively. dbBasenameDGUT and dbBasenameChildren are
// joined on to a dbSet's directory to form the database file paths. dbOpenMode
// is the file mode passed to bolt.Open().
const (
	gutBucket          = "gut"
	childBucket        = "children"
	dbBasenameDGUT     = "dgut.db"
	dbBasenameChildren = dbBasenameDGUT + ".children"
	dbOpenMode         = 0600
)

// ErrDBExists is returned when trying to create a database set in a directory
// that already has non-empty database files in it.
const ErrDBExists = Error("database already exists")
// ErrDBNotExists is returned by DB.Open() when none of the expected database
// files exist (with content) in one of the configured directories.
const ErrDBNotExists = Error("database doesn't exist")
// ErrDirNotFound is returned when a queried directory is not a key in the
// dgut database(s).
const ErrDirNotFound = Error("directory not found")
    52  
// a dbSet is 2 databases, one for storing DGUTs, one for storing children.
//
// dir is the directory that holds (or will hold) both database files; see
// paths(). dguts and children are nil until Create() or Open() has succeeded,
// after which they are the open bolt handles that Close() closes.
type dbSet struct {
	dir      string
	dguts    *bolt.DB
	children *bolt.DB
}
    59  
    60  // newDBSet creates a new newDBSet that knows where its database files are
    61  // located or should be created.
    62  func newDBSet(dir string) *dbSet {
    63  	return &dbSet{
    64  		dir: dir,
    65  	}
    66  }
    67  
    68  // Create creates new database files in our directory. Returns an error if those
    69  // files already exist.
    70  func (s *dbSet) Create() error {
    71  	paths := s.paths()
    72  
    73  	if s.pathsExist(paths) {
    74  		return ErrDBExists
    75  	}
    76  
    77  	db, err := openBoltWritable(paths[0], gutBucket)
    78  	if err != nil {
    79  		return err
    80  	}
    81  
    82  	s.dguts = db
    83  
    84  	db, err = openBoltWritable(paths[1], childBucket)
    85  	s.children = db
    86  
    87  	return err
    88  }
    89  
    90  // paths returns the expected paths for our dgut and children databases
    91  // respectively.
    92  func (s *dbSet) paths() []string {
    93  	return []string{
    94  		filepath.Join(s.dir, dbBasenameDGUT),
    95  		filepath.Join(s.dir, dbBasenameChildren),
    96  	}
    97  }
    98  
    99  // pathsExist tells you if the databases at the given paths already exist.
   100  func (s *dbSet) pathsExist(paths []string) bool {
   101  	for _, path := range paths {
   102  		info, err := os.Stat(path)
   103  		if err == nil && info.Size() != 0 {
   104  			return true
   105  		}
   106  	}
   107  
   108  	return false
   109  }
   110  
   111  // openBoltWritable creates a new database at the given path with the given
   112  // bucket inside.
   113  func openBoltWritable(path, bucket string) (*bolt.DB, error) {
   114  	db, err := bolt.Open(path, dbOpenMode, &bolt.Options{
   115  		NoFreelistSync: true,
   116  		NoGrowSync:     true,
   117  		FreelistType:   bolt.FreelistMapType,
   118  	})
   119  	if err != nil {
   120  		return nil, err
   121  	}
   122  
   123  	err = db.Update(func(tx *bolt.Tx) error {
   124  		_, errc := tx.CreateBucketIfNotExists([]byte(bucket))
   125  
   126  		return errc
   127  	})
   128  
   129  	return db, err
   130  }
   131  
   132  // Open opens our constituent databases read-only.
   133  func (s *dbSet) Open() error {
   134  	paths := s.paths()
   135  
   136  	db, err := openBoltReadOnly(paths[0])
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	s.dguts = db
   142  
   143  	db, err = openBoltReadOnly(paths[1])
   144  	if err != nil {
   145  		return err
   146  	}
   147  
   148  	s.children = db
   149  
   150  	return nil
   151  }
   152  
   153  // openBoltReadOnly opens a bolt database at the given path in read-only mode.
   154  func openBoltReadOnly(path string) (*bolt.DB, error) {
   155  	return bolt.Open(path, dbOpenMode, &bolt.Options{
   156  		ReadOnly:  true,
   157  		MmapFlags: syscall.MAP_POPULATE,
   158  	})
   159  }
   160  
   161  // Close closes our constituent databases.
   162  func (s *dbSet) Close() error {
   163  	var errm *multierror.Error
   164  
   165  	err := s.dguts.Close()
   166  	errm = multierror.Append(errm, err)
   167  
   168  	err = s.children.Close()
   169  	errm = multierror.Append(errm, err)
   170  
   171  	return errm.ErrorOrNil()
   172  }
   173  
// DB is used to create and query a database made from a dgut file, which is the
// directory,group,user,type summary output produced by the summary packages'
// DirGroupUserType.Output() method.
//
// paths are the database directories given to NewDB(). writeSet is the set
// being written to during Store(); readSets are the sets opened by Open().
// batchSize, writeBatch and writeI hold the state of the batch of DGUTs
// currently being collected during Store(), and writeErr holds an error from
// writing a batch (once set, later batches are not written). ch is the codec
// handle used to encode and decode values stored in the databases.
type DB struct {
	paths      []string
	writeSet   *dbSet
	readSets   []*dbSet
	batchSize  int
	writeBatch []*DGUT
	writeI     int
	writeErr   error
	ch         codec.Handle
}
   187  
   188  // NewDB returns a *DB that can be used to create or query a dgut database.
   189  // Provide the path to directory that (will) store(s) the database files. In the
   190  // case of only reading databases with Open(), you can supply multiple directory
   191  // paths to query all of them simultaneously.
   192  func NewDB(paths ...string) *DB {
   193  	return &DB{paths: paths}
   194  }
   195  
   196  // Store will read the given dgut file data (as output by
   197  // summary.DirGroupUserType.Output()) and store it in 2 database files that
   198  // offer fast lookup of the information by directory.
   199  //
   200  // The path for the database directory you provided to NewDB() (only the first
   201  // will be used) must not already have database files in it to create a new
   202  // database. You can't add to an existing database. If you create multiple sets
   203  // of data to store, instead Store them to individual database directories, and
   204  // then load all them together during Open().
   205  //
   206  // batchSize is how many directories worth of information are written to the
   207  // database in one go. More is faster, but uses more memory. 10,000 might be a
   208  // good number to try.
   209  func (d *DB) Store(data io.Reader, batchSize int) error {
   210  	d.batchSize = batchSize
   211  
   212  	err := d.createDB()
   213  	if err != nil {
   214  		return err
   215  	}
   216  
   217  	defer func() {
   218  		errc := d.writeSet.Close()
   219  		if err == nil {
   220  			err = errc
   221  		}
   222  	}()
   223  
   224  	if err = d.storeData(data); err != nil {
   225  		return err
   226  	}
   227  
   228  	if d.writeBatch[0] != nil {
   229  		d.storeBatch()
   230  	}
   231  
   232  	err = d.writeErr
   233  
   234  	return err
   235  }
   236  
   237  // createDB creates a new database set, but only if it doesn't already exist.
   238  func (d *DB) createDB() error {
   239  	set := newDBSet(d.paths[0])
   240  
   241  	err := set.Create()
   242  	if err != nil {
   243  		return err
   244  	}
   245  
   246  	d.writeSet = set
   247  	d.ch = new(codec.BincHandle)
   248  
   249  	return err
   250  }
   251  
   252  // storeData parses the data and stores it in our database file. Only call this
   253  // after calling createDB(), and only call it once.
   254  func (d *DB) storeData(data io.Reader) error {
   255  	d.resetBatch()
   256  
   257  	return parseDGUTLines(data, d.parserCB)
   258  }
   259  
   260  // resetBatch prepares us to receive a new batch of DGUTs from the parser.
   261  func (d *DB) resetBatch() {
   262  	d.writeBatch = make([]*DGUT, d.batchSize)
   263  	d.writeI = 0
   264  }
   265  
   266  // parserCB is a dgutParserCallBack that is called during parsing of dgut file
   267  // data. It batches up the DGUTs we receive, and writes them to the database
   268  // when a batch is full.
   269  func (d *DB) parserCB(dgut *DGUT) {
   270  	d.writeBatch[d.writeI] = dgut
   271  	d.writeI++
   272  
   273  	if d.writeI == d.batchSize {
   274  		d.storeBatch()
   275  		d.resetBatch()
   276  	}
   277  }
   278  
   279  // storeBatch writes the current batch of DGUTs to the database. It also updates
   280  // our dir->child lookup in the database.
   281  func (d *DB) storeBatch() {
   282  	if d.writeErr != nil {
   283  		return
   284  	}
   285  
   286  	var errm *multierror.Error
   287  
   288  	err := d.writeSet.children.Update(d.storeChildren)
   289  	errm = multierror.Append(errm, err)
   290  
   291  	err = d.writeSet.dguts.Update(d.storeDGUTs)
   292  	errm = multierror.Append(errm, err)
   293  
   294  	err = errm.ErrorOrNil()
   295  	if err != nil {
   296  		d.writeErr = err
   297  	}
   298  }
   299  
   300  // storeChildren stores the Dirs of the current DGUT batch in the db.
   301  func (d *DB) storeChildren(txn *bolt.Tx) error {
   302  	b := txn.Bucket([]byte(childBucket))
   303  
   304  	parentToChildren := d.calculateChildrenOfParents(b)
   305  
   306  	for parent, children := range parentToChildren {
   307  		if err := b.Put([]byte(parent), d.encodeChildren(children)); err != nil {
   308  			return err
   309  		}
   310  	}
   311  
   312  	return nil
   313  }
   314  
   315  // calculateChildrenOfParents works out what the children of every parent
   316  // directory of every dgut.Dir is in the current writeBatch. Returns a map
   317  // of parent keys and children slice value.
   318  func (d *DB) calculateChildrenOfParents(b *bolt.Bucket) map[string][]string {
   319  	parentToChildren := make(map[string][]string)
   320  
   321  	for _, dgut := range d.writeBatch {
   322  		if dgut == nil {
   323  			continue
   324  		}
   325  
   326  		d.storeChildrenOfParentInMap(b, dgut.Dir, parentToChildren)
   327  	}
   328  
   329  	return parentToChildren
   330  }
   331  
   332  // storeChildrenOfParentInMap gets current children of child's parent in the db
   333  // and stores them in the store map, then once stored in the map, appends this
   334  // child to the parent's children.
   335  func (d *DB) storeChildrenOfParentInMap(b *bolt.Bucket, child string, store map[string][]string) {
   336  	if child == "/" {
   337  		return
   338  	}
   339  
   340  	parent := filepath.Dir(child)
   341  
   342  	var children []string
   343  
   344  	if storedChildren, stored := store[parent]; stored {
   345  		children = storedChildren
   346  	} else {
   347  		children = d.getChildrenFromDB(b, parent)
   348  	}
   349  
   350  	children = append(children, child)
   351  
   352  	store[parent] = children
   353  }
   354  
   355  // getChildrenFromDB retrieves the child directory values associated with the
   356  // given directory key in the given db. Returns an empty slice if the dir wasn't
   357  // found.
   358  func (d *DB) getChildrenFromDB(b *bolt.Bucket, dir string) []string {
   359  	v := b.Get([]byte(dir))
   360  	if v == nil {
   361  		return []string{}
   362  	}
   363  
   364  	return d.decodeChildrenBytes(v)
   365  }
   366  
   367  // decodeChildBytes converts the byte slice returned by encodeChildren() back
   368  // in to a []string.
   369  func (d *DB) decodeChildrenBytes(encoded []byte) []string {
   370  	dec := codec.NewDecoderBytes(encoded, d.ch)
   371  
   372  	var children []string
   373  
   374  	dec.MustDecode(&children)
   375  
   376  	return children
   377  }
   378  
   379  // encodeChildren returns converts the given string slice into a []byte suitable
   380  // for storing on disk.
   381  func (d *DB) encodeChildren(dirs []string) []byte {
   382  	var encoded []byte
   383  	enc := codec.NewEncoderBytes(&encoded, d.ch)
   384  	enc.MustEncode(dirs)
   385  
   386  	return encoded
   387  }
   388  
   389  // storeDGUTs stores the current batch of DGUTs in the db.
   390  func (d *DB) storeDGUTs(tx *bolt.Tx) error {
   391  	b := tx.Bucket([]byte(gutBucket))
   392  
   393  	for _, dgut := range d.writeBatch {
   394  		if dgut == nil {
   395  			return nil
   396  		}
   397  
   398  		if err := d.storeDGUT(b, dgut); err != nil {
   399  			return err
   400  		}
   401  	}
   402  
   403  	return nil
   404  }
   405  
   406  // storeDGUT stores a DGUT in the db. DGUTs are expected to be unique per
   407  // Store() operation and database.
   408  func (d *DB) storeDGUT(b *bolt.Bucket, dgut *DGUT) error {
   409  	dir, guts := dgut.encodeToBytes(d.ch)
   410  
   411  	return b.Put(dir, guts)
   412  }
   413  
   414  // Open opens the database(s) for reading. You need to call this before using
   415  // the query methods like DirInfo() and Which(). You should call Close() after
   416  // you've finished.
   417  func (d *DB) Open() error {
   418  	readSets := make([]*dbSet, len(d.paths))
   419  
   420  	for i, path := range d.paths {
   421  		readSet := newDBSet(path)
   422  
   423  		if !readSet.pathsExist(readSet.paths()) {
   424  			return ErrDBNotExists
   425  		}
   426  
   427  		err := readSet.Open()
   428  		if err != nil {
   429  			return err
   430  		}
   431  
   432  		readSets[i] = readSet
   433  	}
   434  
   435  	d.readSets = readSets
   436  
   437  	d.ch = new(codec.BincHandle)
   438  
   439  	return nil
   440  }
   441  
   442  // Close closes the database(s) after reading. You should call this once
   443  // you've finished reading, but it's not necessary; errors are ignored.
   444  func (d *DB) Close() {
   445  	if d.readSets == nil {
   446  		return
   447  	}
   448  
   449  	for _, readSet := range d.readSets {
   450  		readSet.Close()
   451  	}
   452  }
   453  
   454  // DirInfo tells you the total number of files, their total size, oldest atime
   455  // and newset mtime nested under the given directory, along with the UIDs, GIDs
   456  // and FTs of those files. See GUTs.Summary for an explanation of the filter.
   457  //
   458  // Returns an error if dir doesn't exist.
   459  //
   460  // You must call Open() before calling this.
   461  func (d *DB) DirInfo(dir string, filter *Filter) (uint64, uint64, int64, int64,
   462  	[]uint32, []uint32, []summary.DirGUTFileType, error) {
   463  	var notFound int
   464  
   465  	dgut := &DGUT{}
   466  
   467  	for _, readSet := range d.readSets {
   468  		if err := readSet.dguts.View(func(tx *bolt.Tx) error {
   469  			b := tx.Bucket([]byte(gutBucket))
   470  
   471  			return getDGUTFromDBAndAppend(b, dir, d.ch, dgut)
   472  		}); err != nil {
   473  			notFound++
   474  		}
   475  	}
   476  
   477  	if notFound == len(d.readSets) {
   478  		return 0, 0, 0, 0, nil, nil, nil, ErrDirNotFound
   479  	}
   480  
   481  	c, s, a, m, u, g, t := dgut.Summary(filter)
   482  
   483  	return c, s, a, m, u, g, t, nil
   484  }
   485  
   486  // getDGUTFromDBAndAppend calls getDGUTFromDB() and appends the result
   487  // to the given dgut. If the given dgut is empty, it will be populated with the
   488  // content of the result instead.
   489  func getDGUTFromDBAndAppend(b *bolt.Bucket, dir string, ch codec.Handle, dgut *DGUT) error {
   490  	thisDGUT, err := getDGUTFromDB(b, dir, ch)
   491  	if err != nil {
   492  		return err
   493  	}
   494  
   495  	if dgut.Dir == "" {
   496  		dgut.Dir = thisDGUT.Dir
   497  		dgut.GUTs = thisDGUT.GUTs
   498  	} else {
   499  		dgut.Append(thisDGUT)
   500  	}
   501  
   502  	return nil
   503  }
   504  
   505  // getDGUTFromDB gets and decodes a dgut from the given database.
   506  func getDGUTFromDB(b *bolt.Bucket, dir string, ch codec.Handle) (*DGUT, error) {
   507  	bdir := []byte(dir)
   508  
   509  	v := b.Get(bdir)
   510  	if v == nil {
   511  		return nil, ErrDirNotFound
   512  	}
   513  
   514  	dgut := decodeDGUTbytes(ch, bdir, v)
   515  
   516  	return dgut, nil
   517  }
   518  
   519  // Children returns the directory paths that are directly inside the given
   520  // directory.
   521  //
   522  // Returns an empty slice if dir had no children (because it was a leaf dir,
   523  // or didn't exist at all).
   524  //
   525  // The same children from multiple databases are de-duplicated.
   526  //
   527  // You must call Open() before calling this.
   528  func (d *DB) Children(dir string) []string {
   529  	children := make(map[string]bool)
   530  
   531  	for _, readSet := range d.readSets {
   532  		// no error is possible here, but the View function requires we return
   533  		// one.
   534  		//nolint:errcheck
   535  		readSet.children.View(func(tx *bolt.Tx) error {
   536  			b := tx.Bucket([]byte(childBucket))
   537  
   538  			for _, child := range d.getChildrenFromDB(b, dir) {
   539  				children[child] = true
   540  			}
   541  
   542  			return nil
   543  		})
   544  	}
   545  
   546  	return mapToSortedKeys(children)
   547  }
   548  
   549  // mapToSortedKeys takes the keys from the given map and returns them as a
   550  // sorted slice. If map length is 0, returns nil.
   551  func mapToSortedKeys(things map[string]bool) []string {
   552  	if len(things) == 0 {
   553  		return nil
   554  	}
   555  
   556  	keys := make([]string, len(things))
   557  	i := 0
   558  
   559  	for thing := range things {
   560  		keys[i] = thing
   561  		i++
   562  	}
   563  
   564  	sort.Strings(keys)
   565  
   566  	return keys
   567  }