github.com/wtsi-ssg/wrstat@v1.1.4-0.20221008232152-3030622a8cf8/dgut/tree.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2022 Genome Research Ltd.
     3   *
     4   * Author: Sendu Bala <sb10@sanger.ac.uk>
     5   *
     6   * Permission is hereby granted, free of charge, to any person obtaining
     7   * a copy of this software and associated documentation files (the
     8   * "Software"), to deal in the Software without restriction, including
     9   * without limitation the rights to use, copy, modify, merge, publish,
    10   * distribute, sublicense, and/or sell copies of the Software, and to
    11   * permit persons to whom the Software is furnished to do so, subject to
    12   * the following conditions:
    13   *
    14   * The above copyright notice and this permission notice shall be included
    15   * in all copies or substantial portions of the Software.
    16   *
    17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    18   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    19   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    20   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    21   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    22   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    23   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24   ******************************************************************************/
    25  
    26  package dgut
    27  
    28  import (
    29  	"sort"
    30  	"time"
    31  
    32  	"github.com/wtsi-ssg/wrstat/summary"
    33  )
    34  
// Tree is used to do high-level queries on DB.Store() database files. Create
// one with NewTree() and Close() it once you have finished querying.
type Tree struct {
	// db is the database that all of the Tree's query methods read from.
	db *DB
}
    39  
    40  // NewTree, given the paths to one or more dgut database files (as created by
    41  // DB.Store()), returns a *Tree that can be used to do high-level queries on the
    42  // stats of a tree of disk folders. You should Close() the tree after use.
    43  func NewTree(paths ...string) (*Tree, error) {
    44  	db := NewDB(paths...)
    45  
    46  	if err := db.Open(); err != nil {
    47  		return nil, err
    48  	}
    49  
    50  	return &Tree{db: db}, nil
    51  }
    52  
// DirSummary holds nested file count, size and atime information on a
// directory. It also holds which users and groups own files nested under the
// directory, and what the file types are.
type DirSummary struct {
	// Dir is the path of the directory this summary describes.
	Dir   string
	// Count is the number of files nested under Dir.
	Count uint64
	// Size is the total size of the files nested under Dir.
	// NOTE(review): presumably in bytes - confirm against DB.DirInfo.
	Size  uint64
	// Atime is the access time information for the nested files; it is built
	// from a whole number of seconds since the Unix epoch.
	// NOTE(review): presumably the oldest atime of the nested files - confirm
	// against DB.DirInfo.
	Atime time.Time
	// UIDs are the IDs of the users that own files nested under Dir.
	UIDs  []uint32
	// GIDs are the IDs of the groups that own files nested under Dir.
	GIDs  []uint32
	// FTs are the types of the files nested under Dir.
	FTs   []summary.DirGUTFileType
}
    65  
    66  // DCSs is a Size-sortable slice of DirSummary.
    67  type DCSs []*DirSummary
    68  
    69  func (d DCSs) Len() int {
    70  	return len(d)
    71  }
    72  func (d DCSs) Swap(i, j int) {
    73  	d[i], d[j] = d[j], d[i]
    74  }
    75  func (d DCSs) Less(i, j int) bool {
    76  	return d[i].Size > d[j].Size
    77  }
    78  
    79  // SortByDir sorts by Dir instead of Size.
    80  func (d DCSs) SortByDir() {
    81  	sort.Slice(d, func(i, j int) bool {
    82  		return d[i].Dir < d[j].Dir
    83  	})
    84  }
    85  
// DirInfo holds nested file count, size, UID and GID information on a
// directory, and also its immediate child directories.
type DirInfo struct {
	// Current is the summary of the directory itself.
	Current  *DirSummary
	// Children are the summaries of the directory's immediate child
	// directories; Tree.DirInfo() only includes children that have a file
	// count greater than zero.
	Children []*DirSummary
}
    92  
    93  // IsSameAsChild tells you if this DirInfo has only 1 child, and the child
    94  // has the same file count. Ie. our child contains the same files as us.
    95  func (d *DirInfo) IsSameAsChild() bool {
    96  	return len(d.Children) == 1 && d.Children[0].Count == d.Current.Count
    97  }
    98  
    99  // DirInfo tells you the total number of files and their total size nested under
   100  // the given directory, along with the UIDs and GIDs that own those files.
   101  // See GUTs.Summary for an explanation of the filter.
   102  //
   103  // It also tells you the same information about the immediate child directories
   104  // of the given directory (if the children have files in them that pass the
   105  // filter).
   106  //
   107  // Returns an error if dir doesn't exist.
   108  func (t *Tree) DirInfo(dir string, filter *Filter) (*DirInfo, error) {
   109  	dcs, err := t.getSummaryInfo(dir, filter)
   110  	if err != nil {
   111  		return nil, err
   112  	}
   113  
   114  	di := &DirInfo{
   115  		Current: dcs,
   116  	}
   117  
   118  	children := t.db.Children(di.Current.Dir)
   119  	err = t.addChildInfo(di, children, filter)
   120  
   121  	return di, err
   122  }
   123  
   124  // DirHasChildren tells you if the given directory has any child directories
   125  // with files in them that pass the filter. See GUTs.Summary for an explanation
   126  // of the filter.
   127  func (t *Tree) DirHasChildren(dir string, filter *Filter) bool {
   128  	children := t.db.Children(dir)
   129  
   130  	for _, child := range children {
   131  		ds, _ := t.getSummaryInfo(child, filter) //nolint:errcheck
   132  
   133  		if ds.Count > 0 {
   134  			return true
   135  		}
   136  	}
   137  
   138  	return false
   139  }
   140  
   141  // getSummaryInfo accesses the database to retrieve the count, size and atime
   142  // info for a given directory and filter, along with the UIDs and GIDs that own
   143  // those files, the file types of those files.
   144  func (t *Tree) getSummaryInfo(dir string, filter *Filter) (*DirSummary, error) {
   145  	c, s, a, u, g, fts, err := t.db.DirInfo(dir, filter)
   146  	if err != nil {
   147  		return nil, err
   148  	}
   149  
   150  	return &DirSummary{
   151  		Dir:   dir,
   152  		Count: c,
   153  		Size:  s,
   154  		Atime: time.Unix(a, 0),
   155  		UIDs:  u,
   156  		GIDs:  g,
   157  		FTs:   fts,
   158  	}, nil
   159  }
   160  
   161  // addChildInfo adds DirSummary info of the given child paths to the di's
   162  // Children. If a child dir has no files in it, it is ignored.
   163  func (t *Tree) addChildInfo(di *DirInfo, children []string, filter *Filter) error {
   164  	for _, child := range children {
   165  		dcs, errc := t.getSummaryInfo(child, filter)
   166  		if errc != nil {
   167  			return errc
   168  		}
   169  
   170  		if dcs.Count > 0 {
   171  			di.Children = append(di.Children, dcs)
   172  		}
   173  	}
   174  
   175  	return nil
   176  }
   177  
   178  // Where tells you where files are nested under dir that pass the filter. With a
   179  // depth of 0 it only returns the single deepest directory that has all passing
   180  // files nested under it.
   181  //
   182  // With a depth of 1, it also returns the results that calling Where() with a
   183  // depth of 0 on each of the deepest directory's children would give. And so on
   184  // recursively for higher depths.
   185  //
   186  // See GUTs.Summary for an explanation of the filter.
   187  //
   188  // For example, if all user 354's files are in the directories /a/b/c/d (2
   189  // files), /a/b/c/d/1 (1 files), /a/b/c/d/2 (2 files) and /a/b/e/f/g (2 files),
   190  // Where("/", &Filter{UIDs: []uint32{354}}, 0) would tell you that "/a/b" has 7
   191  // files. With a depth of 1 it would tell you that "/a/b" has 7 files,
   192  // "/a/b/c/d" has 5 files and "/a/b/e/f/g" has 2 files. With a depth of 2 it
   193  // would tell you that "/a/b" has 7 files, "/a/b/c/d" has 5 files, "/a/b/c/d/1"
   194  // has 1 file, "/a/b/c/d/2" has 2 files, and "/a/b/e/f/g" has 2 files.
   195  //
   196  // The returned DirSummarys are sorted by Size, largest first.
   197  //
   198  // Returns an error if dir doesn't exist.
   199  func (t *Tree) Where(dir string, filter *Filter, depth int) (DCSs, error) {
   200  	var dcss DCSs
   201  
   202  	di, err := t.where0(dir, filter)
   203  	if err != nil {
   204  		return nil, err
   205  	}
   206  
   207  	dcss = append(dcss, di.Current)
   208  
   209  	children := di.Children
   210  
   211  	for i := 0; i < depth; i++ {
   212  		var theseChildren []*DirSummary
   213  
   214  		for _, dcs := range children {
   215  			// where0 can't return an error here, because we're supplying it a
   216  			// directory name that came from the database.
   217  			//nolint:errcheck
   218  			diChild, _ := t.where0(dcs.Dir, filter)
   219  			dcss = append(dcss, diChild.Current)
   220  			theseChildren = append(theseChildren, diChild.Children...)
   221  		}
   222  
   223  		children = theseChildren
   224  	}
   225  
   226  	sort.Sort(dcss)
   227  
   228  	return dcss, nil
   229  }
   230  
   231  // where0 is the implementation of Where() for a depth of 0.
   232  func (t *Tree) where0(dir string, filter *Filter) (*DirInfo, error) {
   233  	di, err := t.DirInfo(dir, filter)
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  
   238  	for di.IsSameAsChild() {
   239  		// DirInfo can't return an error here, because we're supplying it a
   240  		// directory name that came from the database.
   241  		//nolint:errcheck
   242  		di, _ = t.DirInfo(di.Children[0].Dir, filter)
   243  	}
   244  
   245  	return di, nil
   246  }
   247  
   248  // FileLocations, starting from the given dir, finds the first directory that
   249  // directly contains filter-passing files along every branch from dir.
   250  //
   251  // See GUTs.Summary for an explanation of the filter.
   252  //
   253  // The results are returned sorted by directory.
   254  func (t *Tree) FileLocations(dir string, filter *Filter) (DCSs, error) {
   255  	var dcss DCSs
   256  
   257  	di, err := t.DirInfo(dir, filter)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	var childCount uint64
   263  
   264  	for _, child := range di.Children {
   265  		childCount += child.Count
   266  	}
   267  
   268  	if childCount < di.Current.Count {
   269  		dcss = append(dcss, di.Current)
   270  
   271  		return dcss, nil
   272  	}
   273  
   274  	for _, child := range di.Children {
   275  		// FileLocations can't return an error here, because we're supplying it
   276  		// a directory name that came from the database.
   277  		//nolint:errcheck
   278  		childDCSs, _ := t.FileLocations(child.Dir, filter)
   279  		dcss = append(dcss, childDCSs...)
   280  	}
   281  
   282  	dcss.SortByDir()
   283  
   284  	return dcss, nil
   285  }
   286  
   287  // Close should be called after you've finished querying the tree to release its
   288  // database locks.
   289  func (t *Tree) Close() {
   290  	if t.db != nil {
   291  		t.db.Close()
   292  	}
   293  }