github.com/wtsi-ssg/wrstat@v1.1.4-0.20221008232152-3030622a8cf8/cmd/basedir.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2022 Genome Research Ltd.
     3   *
     4   * Author: Sendu Bala <sb10@sanger.ac.uk>
     5   *
     6   * Permission is hereby granted, free of charge, to any person obtaining
     7   * a copy of this software and associated documentation files (the
     8   * "Software"), to deal in the Software without restriction, including
     9   * without limitation the rights to use, copy, modify, merge, publish,
    10   * distribute, sublicense, and/or sell copies of the Software, and to
    11   * permit persons to whom the Software is furnished to do so, subject to
    12   * the following conditions:
    13   *
    14   * The above copyright notice and this permission notice shall be included
    15   * in all copies or substantial portions of the Software.
    16   *
    17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    18   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    19   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    20   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    21   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    22   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    23   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24   ******************************************************************************/
    25  
    26  package cmd
    27  
    28  import (
    29  	"fmt"
    30  	"os"
    31  	"path/filepath"
    32  	"regexp"
    33  	"strings"
    34  	"time"
    35  
    36  	"github.com/spf13/cobra"
    37  	"github.com/wtsi-ssg/wrstat/dgut"
    38  )
    39  
    40  const (
    41  	basedirBasename   = "base.dirs"
    42  	basedirSplits     = 4
    43  	basedirMinDirs    = 4
    44  	basedirMinDirsMDT = 5
    45  )
    46  
    47  var basedirMDTRegexp = regexp.MustCompile(`\/mdt\d(\/|\z)`)
    48  
    49  // basedirCmd represents the basedir command.
    50  var basedirCmd = &cobra.Command{
    51  	Use:   "basedir",
    52  	Short: "Calculate base directories for every unix group.",
    53  	Long: `Calculate base directories for every unix group.
    54  
    55  Provide the unique subdirectory of your 'wrstat multi -w' directory as an unamed
    56  argument to this command.
    57  
    58  This is called by 'wrstat multi' after the combine step has completed. It does
    59  some 'wrstat where'-type calls for every unix group to come up with hopefully
    60  meaningful and useful "base directories" for every group.
    61  
    62  Unlike the real 'wrstat where', this is not restricted by authorization and
    63  directly accesses the database files to see all data.
    64  
    65  A base directory is a directory where all a group's data lies nested within.
    66  
    67  Since a group could have files in multiple mount points mounted at /, the true
    68  base directory would likely always be '/', which wouldn't be useful. Instead,
    69  a 'wrstat where' split of 4 is used, and only paths consisting of at least 4
    70  sub directories are returned. Paths that are subdirectories of other results are
    71  ignored. As a special case, if a path contains 'mdt[n]' as a directory, where n
    72  is a number, then 5 sub directories are required.
    73  
    74  The output file format is 2 tab separated columns with the following contents:
    75  1. Unix group ID.
    76  2. Absolute path of base directory.
    77  
    78  The output file has the name 'base.dirs' in the given directory.`,
    79  	Run: func(cmd *cobra.Command, args []string) {
    80  		if len(args) != 1 {
    81  			die("you must supply the path to your unique subdir of your 'wrstat multi -w' working directory")
    82  		}
    83  
    84  		t := time.Now()
    85  		tree, err := dgut.NewTree(dgutDBCombinePaths(args[0])...)
    86  		if err != nil {
    87  			die("failed to load dgut databases: %s", err)
    88  		}
    89  		info("opening databases took %s", time.Since(t))
    90  
    91  		t = time.Now()
    92  		gids, err := getAllGIDsInTree(tree)
    93  		if err != nil {
    94  			die("failed to get all unix groups: %s", err)
    95  		}
    96  		info("getting GIDs took %s", time.Since(t))
    97  
    98  		t = time.Now()
    99  		err = calculateBaseDirs(tree, filepath.Join(args[0], basedirBasename), gids)
   100  		if err != nil {
   101  			die("failed to create base.dirs: %s", err)
   102  		}
   103  		info("calculating base dirs took %s", time.Since(t))
   104  	},
   105  }
   106  
// init registers basedirCmd as a sub-command of RootCmd.
func init() {
	RootCmd.AddCommand(basedirCmd)
}
   110  
   111  // dgutDBCombinePaths returns the dgut db directories that 'wrstat combine'
   112  // creates in the given output directory.
   113  func dgutDBCombinePaths(dir string) []string {
   114  	paths, err := filepath.Glob(fmt.Sprintf("%s/*/*/%s", dir, combineDGUTOutputFileBasename))
   115  	if err != nil || len(paths) == 0 {
   116  		die("failed to find dgut database directories based on [%s/*/*/%s] (err: %s)",
   117  			dir, combineDGUTOutputFileBasename, err)
   118  	}
   119  
   120  	return paths
   121  }
   122  
   123  // getAllGIDsInTree gets all the unix group IDs that own files in the given file
   124  // tree.
   125  func getAllGIDsInTree(tree *dgut.Tree) ([]uint32, error) {
   126  	di, err := tree.DirInfo("/", nil)
   127  	if err != nil {
   128  		return nil, err
   129  	}
   130  
   131  	return di.Current.GIDs, nil
   132  }
   133  
   134  // calculateBaseDirs does the main work of this command.
   135  func calculateBaseDirs(tree *dgut.Tree, outPath string, gids []uint32) error {
   136  	outFile, err := os.Create(outPath)
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	for _, gid := range gids {
   142  		baseDirs, errc := calculateBaseDirsOfGID(tree, gid)
   143  		if errc != nil {
   144  			return errc
   145  		}
   146  
   147  		if errw := writeBaseDirsOfGID(outFile, gid, baseDirs); errw != nil {
   148  			return errw
   149  		}
   150  	}
   151  
   152  	if err = outFile.Close(); err != nil {
   153  		return err
   154  	}
   155  
   156  	destDirInfo, err := os.Stat(filepath.Dir(outPath))
   157  	if err != nil {
   158  		return err
   159  	}
   160  
   161  	return matchPerms(outPath, destDirInfo)
   162  }
   163  
   164  // calculateBaseDirsOfGID uses the tree to work out what the base directories of
   165  // the given GID are. We manipulate Where() results instead of using
   166  // FileLocations(), because empirically that is too noisy.
   167  func calculateBaseDirsOfGID(tree *dgut.Tree, gid uint32) ([]string, error) {
   168  	dcss, err := tree.Where("/", &dgut.Filter{GIDs: []uint32{gid}}, basedirSplits)
   169  	if err != nil {
   170  		return nil, err
   171  	}
   172  
   173  	dcss.SortByDir()
   174  
   175  	var dirs []string //nolint:prealloc
   176  
   177  	var previous string
   178  
   179  	for _, ds := range dcss {
   180  		if notEnoughDirs(ds.Dir) || childOfPreviousResult(ds.Dir, previous) {
   181  			continue
   182  		}
   183  
   184  		dirs = append(dirs, ds.Dir)
   185  		previous = ds.Dir
   186  	}
   187  
   188  	return dirs, nil
   189  }
   190  
   191  // notEnoughDirs returns true if the given path has fewer than 4 directories.
   192  // If path has an mdt directory in it, then it becomes 5 directories.
   193  func notEnoughDirs(path string) bool {
   194  	numDirs := strings.Count(path, "/")
   195  
   196  	min := basedirMinDirs
   197  	if basedirMDTRegexp.MatchString(path) {
   198  		min = basedirMinDirsMDT
   199  	}
   200  
   201  	return numDirs < min
   202  }
   203  
   204  // childOfPreviousResult returns true if previous is not blank, and dir starts
   205  // with it.
   206  func childOfPreviousResult(dir, previous string) bool {
   207  	return previous != "" && strings.HasPrefix(dir, previous)
   208  }
   209  
   210  // writeBaseDirsOfGID writes entries to the output file for the given gid and
   211  // its base directories.
   212  func writeBaseDirsOfGID(outFile *os.File, gid uint32, dirs []string) error {
   213  	for _, dir := range dirs {
   214  		if _, err := outFile.WriteString(fmt.Sprintf("%d\t%s\n", gid, dir)); err != nil {
   215  			return err
   216  		}
   217  	}
   218  
   219  	return nil
   220  }