github.com/wtsi-ssg/wrstat/v4@v4.5.1/cmd/stat.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2021-2022 Genome Research Ltd.
     3   *
     4   * Author: Sendu Bala <sb10@sanger.ac.uk>
     5   *
     6   * Permission is hereby granted, free of charge, to any person obtaining
     7   * a copy of this software and associated documentation files (the
     8   * "Software"), to deal in the Software without restriction, including
     9   * without limitation the rights to use, copy, modify, merge, publish,
    10   * distribute, sublicense, and/or sell copies of the Software, and to
    11   * permit persons to whom the Software is furnished to do so, subject to
    12   * the following conditions:
    13   *
    14   * The above copyright notice and this permission notice shall be included
    15   * in all copies or substantial portions of the Software.
    16   *
    17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    18   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    19   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    20   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    21   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    22   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    23   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24   ******************************************************************************/
    25  
    26  package cmd
    27  
    28  import (
    29  	"io/fs"
    30  	"os"
    31  	"time"
    32  
    33  	"github.com/spf13/cobra"
    34  	"github.com/wtsi-ssg/wrstat/v4/ch"
    35  	"github.com/wtsi-ssg/wrstat/v4/stat"
    36  	"github.com/wtsi-ssg/wrstat/v4/summary"
    37  )
    38  
    39  const reportFrequency = 10 * time.Minute
    40  const statOutputFileSuffix = ".stats"
    41  const statUserGroupSummaryOutputFileSuffix = ".byusergroup"
    42  const statGroupSummaryOutputFileSuffix = ".bygroup"
    43  const statDGUTSummaryOutputFileSuffix = ".dgut"
    44  const statLogOutputFileSuffix = ".log"
    45  const lstatTimeout = 10 * time.Second
    46  const lstatAttempts = 3
    47  
    48  var statDebug bool
    49  var statCh string
    50  
    51  // statCmd represents the stat command.
    52  var statCmd = &cobra.Command{
    53  	Use:   "stat",
    54  	Short: "Stat paths",
    55  	Long: `Stat paths in a given file.
    56  
    57  Given a file containing an absolute file path per line (eg. as produced by
    58  'wrstat walk'), this creates a new file with stats for each of those file paths.
    59  The new file is named after the input file with a ".stats" suffix.
    60  
    61  The output file format is 11 tab separated columns with the following contents:
    62  1. Base64 encoded path to the file.
    63  2. File size in bytes. If this is greater than the number of bytes in blocks
    64     allocated, this will be the number of bytes in allocated blocks. (This is to
    65     account for files with holes in them; as a byproduct, symbolic links will
    66     be reported as 0 size.)
    67  3. UID.
    68  4. GID.
    69  5. Atime (time of most recent access expressed in seconds).
    70  6. Mtime (time of most recent content modification expressed in seconds.)
    71  7. Ctime (on unix, the time of most recent metadata change in seconds).
    72  8. Filetype:
    73     'f': regular file
    74     'l': symbolic link
    75     's': socket
    76     'b': block special device file
    77     'c': character special device file
    78     'F': FIFO (named pipe)
    79     'X': anything else
    80  9. Inode number (on unix).
    81  10. Number of hard links.
    82  11. Identifier of the device on which this file resides.
    83  
    84  It also summarises file count and size information by grouping on
    85  user+group+directory, and stores this summary in another file named after the
    86  input file with a ".byusergroup" suffix. This is 5 tab separated columns with
    87  the following contents (sorted on the first 3 columns):
    88  
    89  1. username
    90  2. unix group name
    91  3. directory
    92  4. number of files nested under 3 belonging to both 1 & 2.
    93  5. total file size in bytes of the files in 4.
    94  
    95  For example, if user joe using unix group lemur had written 2 10 byte files to
    96  /disk1/dir1, 3 files to /disk1/dir1/dir1a, 1 file to /disk1/dir2, and 1 file to
    97  /disk1/dir1 as unix group fish, then the output would be:
    98  
    99  joe	fish	/disk1	1	10
   100  joe	fish	/disk1/dir1	1	10
   101  joe	lemur	/disk1	6	60
   102  joe	lemur	/disk1/dir1	5	50
   103  joe	lemur	/disk1/dir1/dir1a	3	30
   104  joe	lemur	/disk1/dir2	1	10
   105  
   106  Likewise, it produces a similar file that also shows nested numbers, with these
   107  7 tab separated columns, with a ".dgut" suffix:
   108  
   109  1. directory
   110  2. gid
   111  3. uid
   112  4. filetype - an int with the following meaning: 
   113       0 = other (not any of the others below)
   114       1 = temp (.tmp | temp suffix, or .tmp. | .temp. | tmp. | temp. prefix, or
   115                 a directory in its path is named "tmp" or "temp")
   116       2 = vcf
   117       3 = vcf.gz
   118       4 = bcf
   119       5 = sam
   120       6 = bam
   121       7 = cram
   122       8 = fasta (.fa | .fasta suffix)
   123       9 = fastq (.fq | .fastq suffix)
   124      10 = fastq.gz (.fq.gz | .fastq.gz suffix)
   125      11 = ped/bed (.ped | .map | .bed | .bim | .fam suffix)
   126      12 = compresed (.bzip2 | .gz | .tgz | .zip | .xz | .bgz suffix)
   127      13 = text (.csv | .tsv | .txt | .text | .md | .dat | readme suffix)
   128      14 = log (.log | .out | .o | .err | .e | .err | .oe suffix)
   129  5. number of files nested under 1 belonging to 2 and 3 and having filetype in 4.
   130  6. total file size in bytes of the files in 5.
   131  7. the oldest access time of the files in 5, in seconds since Unix epoch.
   132  8. the newest modified time of the files in 5, in seconds since Unix epoch.
   133  
   134  (Note that files can be both "temp" and one of the other types, so ignore lines
   135  where column 4 is 1 if summing up columns 5 and 6 for a given 1+2+3 for an
   136  "all filetypes" query.)
   137  
   138  It also summarises file count and size information by grouping on group+user,
   139  and stores this summary in another file named after the input file with a
   140  ".bygroup" suffix. This is 4 tab separated columns with the following contents
   141  (sorted on the first 2 columns):
   142  
   143  1. unix group name
   144  2. username
   145  3. number of files belonging to both 1 & 2.
   146  4. total file size in bytes of the files in 3.
   147  
   148  If you supply a tsv file to --ch with the following columns:
   149  directory user group fileperms dirperms
   150  [where *perms format is rwxrwxrwx for user,group,other, where - means remove the
   151  permission, * means leave it unchanged, and one of [rwx] means set it. s for the
   152  group x would enable setting group sticky bit. s implies x. Using ^ in at
   153  least 2 equivalent places means "set all if any set". ie. '**^**^***' would mean
   154  "change nothing, except if execute is set on user or group, set it on both".
   155  user and group can be unix username or unix group name. * means don't set it.
   156  Use ^ to mean copy from the directory.
   157  The file can have blank lines and comment lines that begin with #, which will be
   158  ignored.]
   159  Then any input filesystem path in one of those directories will have its
   160  permissions and ownership changed if needed.
   161  
   162  (Any changes caused by this will not be reflected in the output file, since
   163  the chmod and chown operations happen after path's stats are retrieved.)
   164  
   165  Finally, log messages (including things like warnings and errors while working
   166  on the above) are stored in another file named after the input file with a
   167  ".log" suffix.
   168  `,
   169  	Run: func(cmd *cobra.Command, args []string) {
   170  		if len(args) != 1 {
   171  			die("exactly 1 input file should be provided")
   172  		}
   173  
   174  		logToFile(args[0] + statLogOutputFileSuffix)
   175  
   176  		statPathsInFile(args[0], statCh, statDebug)
   177  	},
   178  }
   179  
   180  func init() {
   181  	RootCmd.AddCommand(statCmd)
   182  
   183  	statCmd.Flags().StringVar(&statCh, "ch", "", "tsv file detailing paths to chmod & chown")
   184  	statCmd.Flags().BoolVar(&statDebug, "debug", false, "output Lstat timings")
   185  }
   186  
   187  // statPathsInFile does the main work.
   188  func statPathsInFile(inputPath string, tsvPath string, debug bool) {
   189  	input, err := os.Open(inputPath)
   190  	if err != nil {
   191  		die("failed to open input file: %s", err)
   192  	}
   193  
   194  	defer func() {
   195  		err = input.Close()
   196  		if err != nil {
   197  			warn("failed to close input file: %s", err)
   198  		}
   199  	}()
   200  
   201  	scanAndStatInput(input, createStatOutputFile(inputPath), tsvPath, debug)
   202  }
   203  
   204  // createStatOutputFile creates a file named input.stats.
   205  func createStatOutputFile(input string) *os.File {
   206  	return createOutputFileWithSuffix(input, statOutputFileSuffix)
   207  }
   208  
   209  // createOutputFileWithSuffix creates an output file named after prefixPath
   210  // appended with suffix.
   211  func createOutputFileWithSuffix(prefixPath, suffix string) *os.File {
   212  	output, err := os.Create(prefixPath + suffix)
   213  	if err != nil {
   214  		die("failed to create output file: %s", err)
   215  	}
   216  
   217  	return output
   218  }
   219  
   220  // scanAndStatInput scans through the input, stats each path, and outputs the
   221  // results to the output.
   222  //
   223  // If tsvPath is not empty, also does chmod and chown operations on certain
   224  // paths.
   225  //
   226  // If debug is true, outputs timings for Lstat calls and other operations.
   227  func scanAndStatInput(input, output *os.File, tsvPath string, debug bool) {
   228  	var frequency time.Duration
   229  	if debug {
   230  		frequency = reportFrequency
   231  	}
   232  
   233  	statter := stat.WithTimeout(lstatTimeout, lstatAttempts, appLogger)
   234  	p := stat.NewPaths(statter, appLogger, frequency)
   235  
   236  	if err := p.AddOperation("file", stat.FileOperation(output)); err != nil {
   237  		die("%s", err)
   238  	}
   239  
   240  	postScan, err := addSummaryOperations(input.Name(), p)
   241  	if err != nil {
   242  		die("%s", err)
   243  	}
   244  
   245  	if err = addChOperation(tsvPath, p); err != nil {
   246  		die("%s", err)
   247  	}
   248  
   249  	if err = p.Scan(input); err != nil {
   250  		die("%s", err)
   251  	}
   252  
   253  	if err = postScan(); err != nil {
   254  		die("%s", err)
   255  	}
   256  }
   257  
   258  // addSummaryOperations adds summary operations to p. Returns a function that
   259  // should be called after p.Scan.
   260  func addSummaryOperations(input string, p *stat.Paths) (func() error, error) {
   261  	outputUserGroupSummaryData, err := addUserGroupSummaryOperation(input, p)
   262  	if err != nil {
   263  		return nil, err
   264  	}
   265  
   266  	outputGroupSummaryData, err := addGroupSummaryOperation(input, p)
   267  	if err != nil {
   268  		return nil, err
   269  	}
   270  
   271  	outputDGUTSummaryData, err := addDGUTSummaryOperation(input, p)
   272  	if err != nil {
   273  		return nil, err
   274  	}
   275  
   276  	return func() error {
   277  		if err = outputUserGroupSummaryData(); err != nil {
   278  			return err
   279  		}
   280  
   281  		if err = outputGroupSummaryData(); err != nil {
   282  			return err
   283  		}
   284  
   285  		return outputDGUTSummaryData()
   286  	}, nil
   287  }
   288  
   289  // addUserGroupSummaryOperation adds an operation to Paths that collects [user,
   290  // group, directory, count, size] summary information. It returns a function
   291  // that you should call after calling p.Scan(), which outputs the summary data
   292  // to file.
   293  func addUserGroupSummaryOperation(input string, p *stat.Paths) (func() error, error) {
   294  	ug := summary.NewByUserGroup()
   295  
   296  	return addSummaryOperator(input, statUserGroupSummaryOutputFileSuffix, "usergroup", p, ug)
   297  }
   298  
// outputOperator is the behaviour addSummaryOperator needs from a summariser.
// The values passed in this file are those returned by summary.New*().
type outputOperator interface {
	// Add has the signature of a stat.Paths operation; addSummaryOperator
	// registers it with p.AddOperation.
	Add(path string, info fs.FileInfo) error
	// Output is called from the function addSummaryOperator returns, with the
	// summary's output file as its argument.
	Output(output summary.StringCloser) error
}
   304  
   305  // addSummaryOperator adds the operation method of o to p after creating an
   306  // output file with given suffix. Returns function that actually writes to the
   307  // output.
   308  func addSummaryOperator(input, suffix, logName string, p *stat.Paths, o outputOperator) (func() error, error) {
   309  	output := createOutputFileWithSuffix(input, suffix)
   310  
   311  	err := p.AddOperation(logName, o.Add)
   312  
   313  	return func() error {
   314  		return o.Output(output)
   315  	}, err
   316  }
   317  
   318  // addGroupSummaryOperation adds an operation to Paths that collects [group,
   319  // user, count, size] summary information. It returns a function that you should
   320  // call after calling p.Scan(), which outputs the summary data to file.
   321  func addGroupSummaryOperation(input string, p *stat.Paths) (func() error, error) {
   322  	g := summary.NewByGroupUser()
   323  
   324  	return addSummaryOperator(input, statGroupSummaryOutputFileSuffix, "group", p, g)
   325  }
   326  
   327  // addDGUTSummaryOperation adds an operation to Paths that collects [directory,
   328  // group, user, filetype, count, size] summary information. It returns a
   329  // function that you should call after calling p.Scan(), which outputs the
   330  // summary data to file.
   331  func addDGUTSummaryOperation(input string, p *stat.Paths) (func() error, error) {
   332  	d := summary.NewByDirGroupUserType()
   333  
   334  	return addSummaryOperator(input, statDGUTSummaryOutputFileSuffix, "dgut", p, d)
   335  }
   336  
   337  // addChOperation adds the chmod&chown operation to the Paths if the tsv file
   338  // has valid contents. No-op if tsvPath is blank.
   339  func addChOperation(tsvPath string, p *stat.Paths) error {
   340  	if tsvPath == "" {
   341  		return nil
   342  	}
   343  
   344  	f, err := os.Open(tsvPath)
   345  	if err != nil {
   346  		return err
   347  	}
   348  
   349  	defer f.Close()
   350  
   351  	rs, err := ch.NewRulesStore().FromTSV(ch.NewTSVReader(f))
   352  	if err != nil {
   353  		return err
   354  	}
   355  
   356  	c := ch.New(rs, appLogger)
   357  
   358  	return p.AddOperation("ch", c.Do)
   359  }