github.com/wtsi-ssg/wrstat/v4@v4.5.1/cmd/stat.go (about) 1 /******************************************************************************* 2 * Copyright (c) 2021-2022 Genome Research Ltd. 3 * 4 * Author: Sendu Bala <sb10@sanger.ac.uk> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining 7 * a copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sublicense, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included 15 * in all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 20 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 21 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 ******************************************************************************/ 25 26 package cmd 27 28 import ( 29 "io/fs" 30 "os" 31 "time" 32 33 "github.com/spf13/cobra" 34 "github.com/wtsi-ssg/wrstat/v4/ch" 35 "github.com/wtsi-ssg/wrstat/v4/stat" 36 "github.com/wtsi-ssg/wrstat/v4/summary" 37 ) 38 39 const reportFrequency = 10 * time.Minute 40 const statOutputFileSuffix = ".stats" 41 const statUserGroupSummaryOutputFileSuffix = ".byusergroup" 42 const statGroupSummaryOutputFileSuffix = ".bygroup" 43 const statDGUTSummaryOutputFileSuffix = ".dgut" 44 const statLogOutputFileSuffix = ".log" 45 const lstatTimeout = 10 * time.Second 46 const lstatAttempts = 3 47 48 var statDebug bool 49 var statCh string 50 51 // statCmd represents the stat command. 52 var statCmd = &cobra.Command{ 53 Use: "stat", 54 Short: "Stat paths", 55 Long: `Stat paths in a given file. 56 57 Given a file containing an absolute file path per line (eg. as produced by 58 'wrstat walk'), this creates a new file with stats for each of those file paths. 59 The new file is named after the input file with a ".stats" suffix. 60 61 The output file format is 11 tab separated columns with the following contents: 62 1. Base64 encoded path to the file. 63 2. File size in bytes. If this is greater than the number of bytes in blocks 64 allocated, this will be the number of bytes in allocated blocks. (This is to 65 account for files with holes in them; as a byproduct, symbolic links will 66 be reported as 0 size.) 67 3. UID. 68 4. GID. 69 5. Atime (time of most recent access expressed in seconds). 70 6. Mtime (time of most recent content modification expressed in seconds.) 71 7. Ctime (on unix, the time of most recent metadata change in seconds). 72 8. Filetype: 73 'f': regular file 74 'l': symbolic link 75 's': socket 76 'b': block special device file 77 'c': character special device file 78 'F': FIFO (named pipe) 79 'X': anything else 80 9. Inode number (on unix). 81 10. Number of hard links. 82 11. Identifier of the device on which this file resides. 83 84 It also summarises file count and size information by grouping on 85 user+group+directory, and stores this summary in another file named after the 86 input file with a ".byusergroup" suffix. This is 5 tab separated columns with 87 the following contents (sorted on the first 3 columns): 88 89 1. username 90 2. unix group name 91 3. directory 92 4. number of files nested under 3 belonging to both 1 & 2. 93 5. total file size in bytes of the files in 4. 94 95 For example, if user joe using unix group lemur had written 2 10 byte files to 96 /disk1/dir1, 3 files to /disk1/dir1/dir1a, 1 file to /disk1/dir2, and 1 file to 97 /disk1/dir1 as unix group fish, then the output would be: 98 99 joe fish /disk1 1 10 100 joe fish /disk1/dir1 1 10 101 joe lemur /disk1 6 60 102 joe lemur /disk1/dir1 5 50 103 joe lemur /disk1/dir1/dir1a 3 30 104 joe lemur /disk1/dir2 1 10 105 106 Likewise, it produces a similar file that also shows nested numbers, with these 107 7 tab separated columns, with a ".dgut" suffix: 108 109 1. directory 110 2. gid 111 3. uid 112 4. filetype - an int with the following meaning: 113 0 = other (not any of the others below) 114 1 = temp (.tmp | temp suffix, or .tmp. | .temp. | tmp. | temp. prefix, or 115 a directory in its path is named "tmp" or "temp") 116 2 = vcf 117 3 = vcf.gz 118 4 = bcf 119 5 = sam 120 6 = bam 121 7 = cram 122 8 = fasta (.fa | .fasta suffix) 123 9 = fastq (.fq | .fastq suffix) 124 10 = fastq.gz (.fq.gz | .fastq.gz suffix) 125 11 = ped/bed (.ped | .map | .bed | .bim | .fam suffix) 126 12 = compresed (.bzip2 | .gz | .tgz | .zip | .xz | .bgz suffix) 127 13 = text (.csv | .tsv | .txt | .text | .md | .dat | readme suffix) 128 14 = log (.log | .out | .o | .err | .e | .err | .oe suffix) 129 5. number of files nested under 1 belonging to 2 and 3 and having filetype in 4. 130 6. total file size in bytes of the files in 5. 131 7. the oldest access time of the files in 5, in seconds since Unix epoch. 132 8. the newest modified time of the files in 5, in seconds since Unix epoch. 133 134 (Note that files can be both "temp" and one of the other types, so ignore lines 135 where column 4 is 1 if summing up columns 5 and 6 for a given 1+2+3 for an 136 "all filetypes" query.) 137 138 It also summarises file count and size information by grouping on group+user, 139 and stores this summary in another file named after the input file with a 140 ".bygroup" suffix. This is 4 tab separated columns with the following contents 141 (sorted on the first 2 columns): 142 143 1. unix group name 144 2. username 145 3. number of files belonging to both 1 & 2. 146 4. total file size in bytes of the files in 3. 147 148 If you supply a tsv file to --ch with the following columns: 149 directory user group fileperms dirperms 150 [where *perms format is rwxrwxrwx for user,group,other, where - means remove the 151 permission, * means leave it unchanged, and one of [rwx] means set it. s for the 152 group x would enable setting group sticky bit. s implies x. Using ^ in at 153 least 2 equivalent places means "set all if any set". ie. '**^**^***' would mean 154 "change nothing, except if execute is set on user or group, set it on both". 155 user and group can be unix username or unix group name. * means don't set it. 156 Use ^ to mean copy from the directory. 157 The file can have blank lines and comment lines that begin with #, which will be 158 ignored.] 159 Then any input filesystem path in one of those directories will have its 160 permissions and ownership changed if needed. 161 162 (Any changes caused by this will not be reflected in the output file, since 163 the chmod and chown operations happen after path's stats are retrieved.) 164 165 Finally, log messages (including things like warnings and errors while working 166 on the above) are stored in another file named after the input file with a 167 ".log" suffix. 168 `, 169 Run: func(cmd *cobra.Command, args []string) { 170 if len(args) != 1 { 171 die("exactly 1 input file should be provided") 172 } 173 174 logToFile(args[0] + statLogOutputFileSuffix) 175 176 statPathsInFile(args[0], statCh, statDebug) 177 }, 178 } 179 180 func init() { 181 RootCmd.AddCommand(statCmd) 182 183 statCmd.Flags().StringVar(&statCh, "ch", "", "tsv file detailing paths to chmod & chown") 184 statCmd.Flags().BoolVar(&statDebug, "debug", false, "output Lstat timings") 185 } 186 187 // statPathsInFile does the main work. 188 func statPathsInFile(inputPath string, tsvPath string, debug bool) { 189 input, err := os.Open(inputPath) 190 if err != nil { 191 die("failed to open input file: %s", err) 192 } 193 194 defer func() { 195 err = input.Close() 196 if err != nil { 197 warn("failed to close input file: %s", err) 198 } 199 }() 200 201 scanAndStatInput(input, createStatOutputFile(inputPath), tsvPath, debug) 202 } 203 204 // createStatOutputFile creates a file named input.stats. 205 func createStatOutputFile(input string) *os.File { 206 return createOutputFileWithSuffix(input, statOutputFileSuffix) 207 } 208 209 // createOutputFileWithSuffix creates an output file named after prefixPath 210 // appended with suffix. 211 func createOutputFileWithSuffix(prefixPath, suffix string) *os.File { 212 output, err := os.Create(prefixPath + suffix) 213 if err != nil { 214 die("failed to create output file: %s", err) 215 } 216 217 return output 218 } 219 220 // scanAndStatInput scans through the input, stats each path, and outputs the 221 // results to the output. 222 // 223 // If tsvPath is not empty, also does chmod and chown operations on certain 224 // paths. 225 // 226 // If debug is true, outputs timings for Lstat calls and other operations. 227 func scanAndStatInput(input, output *os.File, tsvPath string, debug bool) { 228 var frequency time.Duration 229 if debug { 230 frequency = reportFrequency 231 } 232 233 statter := stat.WithTimeout(lstatTimeout, lstatAttempts, appLogger) 234 p := stat.NewPaths(statter, appLogger, frequency) 235 236 if err := p.AddOperation("file", stat.FileOperation(output)); err != nil { 237 die("%s", err) 238 } 239 240 postScan, err := addSummaryOperations(input.Name(), p) 241 if err != nil { 242 die("%s", err) 243 } 244 245 if err = addChOperation(tsvPath, p); err != nil { 246 die("%s", err) 247 } 248 249 if err = p.Scan(input); err != nil { 250 die("%s", err) 251 } 252 253 if err = postScan(); err != nil { 254 die("%s", err) 255 } 256 } 257 258 // addSummaryOperations adds summary operations to p. Returns a function that 259 // should be called after p.Scan. 260 func addSummaryOperations(input string, p *stat.Paths) (func() error, error) { 261 outputUserGroupSummaryData, err := addUserGroupSummaryOperation(input, p) 262 if err != nil { 263 return nil, err 264 } 265 266 outputGroupSummaryData, err := addGroupSummaryOperation(input, p) 267 if err != nil { 268 return nil, err 269 } 270 271 outputDGUTSummaryData, err := addDGUTSummaryOperation(input, p) 272 if err != nil { 273 return nil, err 274 } 275 276 return func() error { 277 if err = outputUserGroupSummaryData(); err != nil { 278 return err 279 } 280 281 if err = outputGroupSummaryData(); err != nil { 282 return err 283 } 284 285 return outputDGUTSummaryData() 286 }, nil 287 } 288 289 // addUserGroupSummaryOperation adds an operation to Paths that collects [user, 290 // group, directory, count, size] summary information. It returns a function 291 // that you should call after calling p.Scan(), which outputs the summary data 292 // to file. 293 func addUserGroupSummaryOperation(input string, p *stat.Paths) (func() error, error) { 294 ug := summary.NewByUserGroup() 295 296 return addSummaryOperator(input, statUserGroupSummaryOutputFileSuffix, "usergroup", p, ug) 297 } 298 299 // outputOperators are types returned by summary.New*(). 300 type outputOperator interface { 301 Add(path string, info fs.FileInfo) error 302 Output(output summary.StringCloser) error 303 } 304 305 // addSummaryOperator adds the operation method of o to p after creating an 306 // output file with given suffix. Returns function that actually writes to the 307 // output. 308 func addSummaryOperator(input, suffix, logName string, p *stat.Paths, o outputOperator) (func() error, error) { 309 output := createOutputFileWithSuffix(input, suffix) 310 311 err := p.AddOperation(logName, o.Add) 312 313 return func() error { 314 return o.Output(output) 315 }, err 316 } 317 318 // addGroupSummaryOperation adds an operation to Paths that collects [group, 319 // user, count, size] summary information. It returns a function that you should 320 // call after calling p.Scan(), which outputs the summary data to file. 321 func addGroupSummaryOperation(input string, p *stat.Paths) (func() error, error) { 322 g := summary.NewByGroupUser() 323 324 return addSummaryOperator(input, statGroupSummaryOutputFileSuffix, "group", p, g) 325 } 326 327 // addDGUTSummaryOperation adds an operation to Paths that collects [directory, 328 // group, user, filetype, count, size] summary information. It returns a 329 // function that you should call after calling p.Scan(), which outputs the 330 // summary data to file. 331 func addDGUTSummaryOperation(input string, p *stat.Paths) (func() error, error) { 332 d := summary.NewByDirGroupUserType() 333 334 return addSummaryOperator(input, statDGUTSummaryOutputFileSuffix, "dgut", p, d) 335 } 336 337 // addChOperation adds the chmod&chown operation to the Paths if the tsv file 338 // has valid contents. No-op if tsvPath is blank. 339 func addChOperation(tsvPath string, p *stat.Paths) error { 340 if tsvPath == "" { 341 return nil 342 } 343 344 f, err := os.Open(tsvPath) 345 if err != nil { 346 return err 347 } 348 349 defer f.Close() 350 351 rs, err := ch.NewRulesStore().FromTSV(ch.NewTSVReader(f)) 352 if err != nil { 353 return err 354 } 355 356 c := ch.New(rs, appLogger) 357 358 return p.AddOperation("ch", c.Do) 359 }