// github.com/wtsi-ssg/wrstat@v1.1.4-0.20221008232152-3030622a8cf8/cmd/basedir.go

/*******************************************************************************
 * Copyright (c) 2022 Genome Research Ltd.
 *
 * Author: Sendu Bala <sb10@sanger.ac.uk>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ******************************************************************************/

package cmd

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/spf13/cobra"
	"github.com/wtsi-ssg/wrstat/dgut"
)

const (
	// basedirBasename is the name of the output file, which is created inside
	// the directory given on the command line.
	basedirBasename = "base.dirs"

	// basedirSplits is the number of splits passed to the tree's Where() call
	// when looking for a group's directories.
	basedirSplits = 4

	// basedirMinDirs is the minimum number of "/" characters a path must
	// contain to be reported as a base directory.
	basedirMinDirs = 4

	// basedirMinDirsMDT replaces basedirMinDirs for paths that match
	// basedirMDTRegexp.
	basedirMinDirsMDT = 5
)

// basedirMDTRegexp matches paths that have a component consisting of "mdt"
// followed by a digit, where that component is either followed by "/" or ends
// the path.
//
// NOTE(review): \d matches exactly one digit, so a component such as "mdt10"
// does not match, although the command's help text describes 'mdt[n]' where n
// is "a number" - confirm whether multi-digit suffixes should count.
var basedirMDTRegexp = regexp.MustCompile(`\/mdt\d(\/|\z)`)

// basedirCmd represents the basedir command.
50 var basedirCmd = &cobra.Command{ 51 Use: "basedir", 52 Short: "Calculate base directories for every unix group.", 53 Long: `Calculate base directories for every unix group. 54 55 Provide the unique subdirectory of your 'wrstat multi -w' directory as an unamed 56 argument to this command. 57 58 This is called by 'wrstat multi' after the combine step has completed. It does 59 some 'wrstat where'-type calls for every unix group to come up with hopefully 60 meaningful and useful "base directories" for every group. 61 62 Unlike the real 'wrstat where', this is not restricted by authorization and 63 directly accesses the database files to see all data. 64 65 A base directory is a directory where all a group's data lies nested within. 66 67 Since a group could have files in multiple mount points mounted at /, the true 68 base directory would likely always be '/', which wouldn't be useful. Instead, 69 a 'wrstat where' split of 4 is used, and only paths consisting of at least 4 70 sub directories are returned. Paths that are subdirectories of other results are 71 ignored. As a special case, if a path contains 'mdt[n]' as a directory, where n 72 is a number, then 5 sub directories are required. 73 74 The output file format is 2 tab separated columns with the following contents: 75 1. Unix group ID. 76 2. Absolute path of base directory. 77 78 The output file has the name 'base.dirs' in the given directory.`, 79 Run: func(cmd *cobra.Command, args []string) { 80 if len(args) != 1 { 81 die("you must supply the path to your unique subdir of your 'wrstat multi -w' working directory") 82 } 83 84 t := time.Now() 85 tree, err := dgut.NewTree(dgutDBCombinePaths(args[0])...) 
86 if err != nil { 87 die("failed to load dgut databases: %s", err) 88 } 89 info("opening databases took %s", time.Since(t)) 90 91 t = time.Now() 92 gids, err := getAllGIDsInTree(tree) 93 if err != nil { 94 die("failed to get all unix groups: %s", err) 95 } 96 info("getting GIDs took %s", time.Since(t)) 97 98 t = time.Now() 99 err = calculateBaseDirs(tree, filepath.Join(args[0], basedirBasename), gids) 100 if err != nil { 101 die("failed to create base.dirs: %s", err) 102 } 103 info("calculating base dirs took %s", time.Since(t)) 104 }, 105 } 106 107 func init() { 108 RootCmd.AddCommand(basedirCmd) 109 } 110 111 // dgutDBCombinePaths returns the dgut db directories that 'wrstat combine' 112 // creates in the given output directory. 113 func dgutDBCombinePaths(dir string) []string { 114 paths, err := filepath.Glob(fmt.Sprintf("%s/*/*/%s", dir, combineDGUTOutputFileBasename)) 115 if err != nil || len(paths) == 0 { 116 die("failed to find dgut database directories based on [%s/*/*/%s] (err: %s)", 117 dir, combineDGUTOutputFileBasename, err) 118 } 119 120 return paths 121 } 122 123 // getAllGIDsInTree gets all the unix group IDs that own files in the given file 124 // tree. 125 func getAllGIDsInTree(tree *dgut.Tree) ([]uint32, error) { 126 di, err := tree.DirInfo("/", nil) 127 if err != nil { 128 return nil, err 129 } 130 131 return di.Current.GIDs, nil 132 } 133 134 // calculateBaseDirs does the main work of this command. 
135 func calculateBaseDirs(tree *dgut.Tree, outPath string, gids []uint32) error { 136 outFile, err := os.Create(outPath) 137 if err != nil { 138 return err 139 } 140 141 for _, gid := range gids { 142 baseDirs, errc := calculateBaseDirsOfGID(tree, gid) 143 if errc != nil { 144 return errc 145 } 146 147 if errw := writeBaseDirsOfGID(outFile, gid, baseDirs); errw != nil { 148 return errw 149 } 150 } 151 152 if err = outFile.Close(); err != nil { 153 return err 154 } 155 156 destDirInfo, err := os.Stat(filepath.Dir(outPath)) 157 if err != nil { 158 return err 159 } 160 161 return matchPerms(outPath, destDirInfo) 162 } 163 164 // calculateBaseDirsOfGID uses the tree to work out what the base directories of 165 // the given GID are. We manipulate Where() results instead of using 166 // FileLocations(), because empirically that is too noisy. 167 func calculateBaseDirsOfGID(tree *dgut.Tree, gid uint32) ([]string, error) { 168 dcss, err := tree.Where("/", &dgut.Filter{GIDs: []uint32{gid}}, basedirSplits) 169 if err != nil { 170 return nil, err 171 } 172 173 dcss.SortByDir() 174 175 var dirs []string //nolint:prealloc 176 177 var previous string 178 179 for _, ds := range dcss { 180 if notEnoughDirs(ds.Dir) || childOfPreviousResult(ds.Dir, previous) { 181 continue 182 } 183 184 dirs = append(dirs, ds.Dir) 185 previous = ds.Dir 186 } 187 188 return dirs, nil 189 } 190 191 // notEnoughDirs returns true if the given path has fewer than 4 directories. 192 // If path has an mdt directory in it, then it becomes 5 directories. 193 func notEnoughDirs(path string) bool { 194 numDirs := strings.Count(path, "/") 195 196 min := basedirMinDirs 197 if basedirMDTRegexp.MatchString(path) { 198 min = basedirMinDirsMDT 199 } 200 201 return numDirs < min 202 } 203 204 // childOfPreviousResult returns true if previous is not blank, and dir starts 205 // with it. 
//
// The match only counts on a path-component boundary: "/a/b" is a parent of
// "/a/b/c", but not of "/a/bc". A dir identical to previous also counts as a
// child, so that duplicates are suppressed.
func childOfPreviousResult(dir, previous string) bool {
	if previous == "" || !strings.HasPrefix(dir, previous) {
		return false
	}

	// Identical paths, or previous already ends in a separator (eg. "/"), in
	// which case the plain prefix match is already component-aligned.
	if len(dir) == len(previous) || strings.HasSuffix(previous, "/") {
		return true
	}

	// Otherwise the character following the prefix must start a new component.
	return dir[len(previous)] == '/'
}

// writeBaseDirsOfGID writes entries to the output file for the given gid and
// its base directories.
//
// One line is written per directory, in the order given: the decimal gid, a
// tab, the directory and a newline. Nothing is written if dirs is empty. The
// first write error is returned and stops further writing.
func writeBaseDirsOfGID(outFile *os.File, gid uint32, dirs []string) error {
	for _, dir := range dirs {
		if _, err := fmt.Fprintf(outFile, "%d\t%s\n", gid, dir); err != nil {
			return err
		}
	}

	return nil
}