github.com/wtsi-ssg/wrstat@v1.1.4-0.20221008232152-3030622a8cf8/cmd/tidy.go (about)

     1  /*******************************************************************************
     2   * Copyright (c) 2021-2022 Genome Research Ltd.
     3   *
     4   * Author: Sendu Bala <sb10@sanger.ac.uk>
     5   *
     6   * Permission is hereby granted, free of charge, to any person obtaining
     7   * a copy of this software and associated documentation files (the
     8   * "Software"), to deal in the Software without restriction, including
     9   * without limitation the rights to use, copy, modify, merge, publish,
    10   * distribute, sublicense, and/or sell copies of the Software, and to
    11   * permit persons to whom the Software is furnished to do so, subject to
    12   * the following conditions:
    13   *
    14   * The above copyright notice and this permission notice shall be included
    15   * in all copies or substantial portions of the Software.
    16   *
    17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    18   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    19   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    20   * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    21   * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    22   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    23   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24   ******************************************************************************/
    25  
    26  package cmd
    27  
    28  import (
    29  	"errors"
    30  	"fmt"
    31  	"io/fs"
    32  	"os"
    33  	"path/filepath"
    34  	"syscall"
    35  	"time"
    36  
    37  	"github.com/spf13/cobra"
    38  	shutil "github.com/termie/go-shutil"
    39  )
    40  
    41  // modeRW are the read-write permission bits for user, group and other.
    42  const modeRW = 0666
    43  
    44  const dgutDBsSuffix = "dgut.dbs"
    45  const pathSizeDBsSuffix = "pathsize.dbs"
    46  const dbsSentinelBasename = ".dbs.updated"
    47  
    48  // options for this cmd.
    49  var tidyDir string
    50  var tidyDate string
    51  
    52  // tidyCmd represents the tidy command.
    53  var tidyCmd = &cobra.Command{
    54  	Use:   "tidy",
    55  	Short: "Tidy up multi output.",
    56  	Long: `Tidy up multi output.
    57  
    58  This is called by 'wrstat multi' after the main work has completed. It moves
    59  final output files from the supplied unique working directory to the
    60  --final_output directory, then deletes the working direcory.
    61  
    62  multi creates a unique ("multi unique") directory, in which it creates
    63  directories named after the basename of the directory of interest
    64  ("interest basename"), in which it creates another unique directory ("interest
    65  unique"), in which it creates the output files.
    66  
    67  tidy assumes the working directory you give it is the "multi unique" from multi.
    68  It probably won't do the right thing if not.
    69  
    70  Final output files are named to include the given --date as follows:
    71  [date]_[interest basename].[interest unique].[multi unique].[suffix]
    72  
    73  Where [suffix] is one of 'stats.gz', 'byusergroup.gz', 'bygroup' or 'logs.gz'.
    74  
    75  The base.dirs file directly inside the given "multi unique" directory is named:
    76  [date]_[multi unique].basedirs
    77  
    78  It also moves the combine.dgut.db directories to inside a directory named:
    79  [date]_[multi unique].dgut.dbs
    80  (making them sequentially numbered sub-directories)
    81  
    82  Likewise, it moves the combine.pathsize.db files to inside a directory named:
    83  [date]_[multi unique].pathsize.dbs
    84  (making them sequentially numbered files)
    85  
    86  Finally, it creates or touches a file named '.dgut.dbs.updated' in the
    87  --final_output directory, giving it an mtime matching the oldest mtime of the
    88  walk log files. 'wrstat server' will use this file to reload its database and
    89  update its knowledge of when the data was captured.
    90  
    91  The output files will be given the same user:group ownership and
    92  user,group,other read & write permissions as the --final_output directory.
    93  
    94  Once all output files have been moved, the "multi unique" directory is deleted.
    95  
    96  It is safe to call this multiple times if it was, for example, killed half way
    97  through; it won't clobber final outputs already moved.`,
    98  	Run: func(cmd *cobra.Command, args []string) {
    99  		if tidyDir == "" {
   100  			die("--final_output is required")
   101  		}
   102  		if len(args) != 1 {
   103  			die("exactly 1 unique working directory from 'wrstat multi' must be supplied")
   104  		}
   105  
   106  		destDir, err := filepath.Abs(tidyDir)
   107  		if err != nil {
   108  			die("could not determine absolute path to --final_output dir: %s", err)
   109  		}
   110  
   111  		err = os.MkdirAll(destDir, userOnlyPerm)
   112  		if err != nil {
   113  			die("failed to create --final_output dir [%s]: %s", destDir, err)
   114  		}
   115  
   116  		destDirInfo, err := os.Stat(destDir)
   117  		if err != nil {
   118  			die("could not stat the --final_output dir: %s", err)
   119  		}
   120  
   121  		sourceDir, err := filepath.Abs(args[0])
   122  		if err != nil {
   123  			die("could not determine absolute path to source dir: %s", err)
   124  		}
   125  
   126  		err = moveAndDelete(sourceDir, destDir, destDirInfo, tidyDate)
   127  		if err != nil {
   128  			die("failed to tidy: %s", err)
   129  		}
   130  	},
   131  }
   132  
   133  func init() {
   134  	RootCmd.AddCommand(tidyCmd)
   135  
   136  	// flags specific to this sub-command
   137  	tidyCmd.Flags().StringVarP(&tidyDir, "final_output", "f", "", "final output directory")
   138  	tidyCmd.Flags().StringVarP(&tidyDate, "date", "d", "", "datestamp of when 'wrstat multi' was called")
   139  }
   140  
   141  // moveAndDelete does the main work of this cmd.
   142  func moveAndDelete(sourceDir, destDir string, destDirInfo fs.FileInfo, date string) error {
   143  	if err := findAndMoveOutputs(sourceDir, destDir, destDirInfo, date,
   144  		combineStatsOutputFileBasename, "stats.gz"); err != nil {
   145  		return err
   146  	}
   147  
   148  	if err := findAndMoveOutputs(sourceDir, destDir, destDirInfo, date,
   149  		combineUserGroupOutputFileBasename, "byusergroup.gz"); err != nil {
   150  		return err
   151  	}
   152  
   153  	if err := findAndMoveOutputs(sourceDir, destDir, destDirInfo, date,
   154  		combineGroupOutputFileBasename, "bygroup"); err != nil {
   155  		return err
   156  	}
   157  
   158  	if err := findAndMoveOutputs(sourceDir, destDir, destDirInfo, date,
   159  		combineLogOutputFileBasename, "logs.gz"); err != nil {
   160  		return err
   161  	}
   162  
   163  	if err := moveBaseDirsFile(sourceDir, destDir, destDirInfo, date); err != nil {
   164  		return err
   165  	}
   166  
   167  	if err := findAndMoveDBs(sourceDir, destDir, destDirInfo, date); err != nil {
   168  		return err
   169  	}
   170  
   171  	return os.RemoveAll(sourceDir)
   172  }
   173  
   174  // findAndMoveOutputs finds output files in the given sourceDir with given
   175  // suffix and moves them to destDir, including date in the name, and adjusting
   176  // ownership and permissions to match the destDir.
   177  func findAndMoveOutputs(sourceDir, destDir string, destDirInfo fs.FileInfo,
   178  	date, inputSuffix, outputSuffix string) error {
   179  	outputPaths, err := filepath.Glob(fmt.Sprintf("%s/*/*/%s", sourceDir, inputSuffix))
   180  	if err != nil {
   181  		return err
   182  	}
   183  
   184  	err = moveOutputs(outputPaths, destDir, destDirInfo, date, outputSuffix)
   185  	if err != nil {
   186  		return err
   187  	}
   188  
   189  	return nil
   190  }
   191  
   192  // moveOutputs calls moveOutput() on each outputPaths source file.
   193  func moveOutputs(outputPaths []string, destDir string, destDirInfo fs.FileInfo, date, suffix string) error {
   194  	for _, path := range outputPaths {
   195  		err := moveOutput(path, destDir, destDirInfo, date, suffix)
   196  		if err != nil {
   197  			return err
   198  		}
   199  	}
   200  
   201  	return nil
   202  }
   203  
   204  // moveOutput moves an output file to the finalDir and changes its name to
   205  // the correct format, then adjusts ownership and permissions to match the
   206  // destDir.
   207  func moveOutput(source string, destDir string, destDirInfo fs.FileInfo, date, suffix string) error {
   208  	interestUniqueDir := filepath.Dir(source)
   209  	interestBaseDir := filepath.Dir(interestUniqueDir)
   210  	multiUniqueDir := filepath.Dir(interestBaseDir)
   211  	dest := filepath.Join(destDir, fmt.Sprintf("%s_%s.%s.%s.%s",
   212  		date,
   213  		filepath.Base(interestBaseDir),
   214  		filepath.Base(interestUniqueDir),
   215  		filepath.Base(multiUniqueDir),
   216  		suffix))
   217  
   218  	return renameAndMatchPerms(source, dest, destDirInfo)
   219  }
   220  
   221  // renameAndMatchPerms tries 2 ways to rename the file (resorting to a copy if
   222  // this is across filesystem boundaries), then matches the dest file permissions
   223  // to the given FileInfo.
   224  //
   225  // If source doesn't exist, but dest does, assumes the rename was done
   226  // previously and just tries to match the permissions.
   227  func renameAndMatchPerms(source, dest string, destDirInfo fs.FileInfo) error {
   228  	if _, err := os.Stat(source); errors.Is(err, os.ErrNotExist) {
   229  		if _, err = os.Stat(dest); err == nil {
   230  			return matchPerms(dest, destDirInfo)
   231  		}
   232  	}
   233  
   234  	err := os.Rename(source, dest)
   235  	if err != nil {
   236  		if err = shutil.CopyFile(source, dest, false); err != nil {
   237  			return err
   238  		}
   239  	}
   240  
   241  	return matchPerms(dest, destDirInfo)
   242  }
   243  
   244  // matchPerms ensures that the given file has the same ownership and read-write
   245  // permissions as the given fileinfo.
   246  func matchPerms(path string, desired fs.FileInfo) error {
   247  	current, err := os.Stat(path)
   248  	if err != nil {
   249  		return err
   250  	}
   251  
   252  	if err = matchOwnership(path, current, desired); err != nil {
   253  		return err
   254  	}
   255  
   256  	return matchReadWrite(path, current, desired)
   257  }
   258  
   259  // matchOwnership ensures that the given file with the current fileinfo has the
   260  // same user and group ownership as the desired fileinfo.
   261  func matchOwnership(path string, current, desired fs.FileInfo) error {
   262  	uid, gid := getUIDAndGID(current)
   263  	desiredUID, desiredGID := getUIDAndGID(desired)
   264  
   265  	if uid == desiredUID && gid == desiredGID {
   266  		return nil
   267  	}
   268  
   269  	return os.Lchown(path, desiredUID, desiredGID)
   270  }
   271  
   272  // getUIDAndGID extracts the UID and GID from a FileInfo. NB: this will only
   273  // work on linux.
   274  func getUIDAndGID(info fs.FileInfo) (int, int) {
   275  	return int(info.Sys().(*syscall.Stat_t).Uid), int(info.Sys().(*syscall.Stat_t).Gid) //nolint:forcetypeassert
   276  }
   277  
   278  // matchReadWrite ensures that the given file with the current fileinfo has the
   279  // same user,group,other read&write permissions as the desired fileinfo.
   280  func matchReadWrite(path string, current, desired fs.FileInfo) error {
   281  	currentMode := current.Mode()
   282  	currentRW := currentMode & modeRW
   283  	desiredRW := desired.Mode() & modeRW
   284  
   285  	if currentRW == desiredRW {
   286  		return nil
   287  	}
   288  
   289  	return os.Chmod(path, currentMode|desiredRW)
   290  }
   291  
   292  // moveBaseDirsFile moves the base.dirs file in sourceDir to a uniquely named
   293  // .basedirs file in destDir that includes the given date.
   294  func moveBaseDirsFile(sourceDir, destDir string, destDirInfo fs.FileInfo, date string) error {
   295  	source := filepath.Join(sourceDir, basedirBasename)
   296  
   297  	dest := filepath.Join(destDir, fmt.Sprintf("%s_%s.basedirs",
   298  		date,
   299  		filepath.Base(sourceDir)))
   300  
   301  	return renameAndMatchPerms(source, dest, destDirInfo)
   302  }
   303  
   304  // findAndMoveDBs finds the combine.dgut.db directories in the given sourceDir
   305  // and moves them to a uniquely named dir in destDir that includes the given
   306  // date, and adjusts ownership and permissions to match the destDir.
   307  //
   308  // Likewise, it finds the combine.pathsize.db files in the given sourceDir and
   309  // moves them to a similarly named dir in destDir, with ownership and perms
   310  // matched.
   311  //
   312  // It also touches a file that 'wrstat server' monitors to know when to reload
   313  // its database files. It gives that file an mtime corresponding to the oldest
   314  // mtime of the walk log files.
   315  func findAndMoveDBs(sourceDir, destDir string, destDirInfo fs.FileInfo, date string) error {
   316  	err := findAndMoveSpecificDBs(combineDGUTOutputFileBasename, dgutDBsSuffix,
   317  		sourceDir, destDir, destDirInfo, date)
   318  	if err != nil {
   319  		return err
   320  	}
   321  
   322  	err = findAndMoveSpecificDBs(combinePathSizeOutputFileBasename, pathSizeDBsSuffix,
   323  		sourceDir, destDir, destDirInfo, date)
   324  	if err != nil {
   325  		return err
   326  	}
   327  
   328  	return touchDBUpdatedFile(sourceDir, destDir, destDirInfo)
   329  }
   330  
   331  func findAndMoveSpecificDBs(sourceBase, destSuffix,
   332  	sourceDir, destDir string, destDirInfo fs.FileInfo, date string) error {
   333  	sources, errg := filepath.Glob(fmt.Sprintf("%s/*/*/%s", sourceDir, sourceBase))
   334  	if errg != nil {
   335  		return errg
   336  	}
   337  
   338  	dbsDir, err := makeDBsDir(sourceDir, destDir, destSuffix, destDirInfo, date)
   339  	if err != nil {
   340  		return err
   341  	}
   342  
   343  	for i, source := range sources {
   344  		if _, err = os.Stat(source); err != nil {
   345  			return err
   346  		}
   347  
   348  		dest := filepath.Join(dbsDir, fmt.Sprintf("%d", i))
   349  
   350  		err = renameAndMatchPerms(source, dest, destDirInfo)
   351  		if err != nil {
   352  			return err
   353  		}
   354  	}
   355  
   356  	return matchPermsInsideDir(dbsDir, destDirInfo)
   357  }
   358  
   359  // makeDBsDir makes a uniquely named directory featuring the given date to hold
   360  // database files in destDir. If it already exists, does nothing. Returns the
   361  // path to the database directory and any error.
   362  func makeDBsDir(sourceDir, destDir, suffix string, destDirInfo fs.FileInfo, date string) (string, error) {
   363  	dbsDir := filepath.Join(destDir, fmt.Sprintf("%s_%s.%s",
   364  		date,
   365  		filepath.Base(sourceDir),
   366  		suffix,
   367  	))
   368  
   369  	err := os.Mkdir(dbsDir, destDirInfo.Mode().Perm())
   370  	if os.IsExist(err) {
   371  		err = nil
   372  	}
   373  
   374  	return dbsDir, err
   375  }
   376  
   377  // matchPermsInsideDir does matchPerms for all the files in the given dir
   378  // recursively.
   379  func matchPermsInsideDir(dir string, desired fs.FileInfo) error {
   380  	return filepath.WalkDir(dir, func(path string, de fs.DirEntry, err error) error {
   381  		if err != nil {
   382  			return err
   383  		}
   384  
   385  		return matchPerms(path, desired)
   386  	})
   387  }
   388  
   389  // touchDBUpdatedFile touches a file that the server monitors so that it knows
   390  // to try and reload the databases. Matches the permissions of the touched file
   391  // to the given permissions. Gives the file an mtime corresponding to the oldest
   392  // mtime of walk log files.
   393  func touchDBUpdatedFile(sourceDir, destDir string, desired fs.FileInfo) error {
   394  	sentinel := filepath.Join(destDir, dbsSentinelBasename)
   395  
   396  	oldest, err := getOldestMtimeOfWalkFiles(sourceDir)
   397  	if err != nil {
   398  		return err
   399  	}
   400  
   401  	_, err = os.Stat(sentinel)
   402  	if os.IsNotExist(err) {
   403  		if err = createFile(sentinel); err != nil {
   404  			return err
   405  		}
   406  	}
   407  
   408  	if err = touchFile(sentinel, oldest); err != nil {
   409  		return err
   410  	}
   411  
   412  	return matchPerms(sentinel, desired)
   413  }
   414  
   415  // createFile creates the given path.
   416  func createFile(path string) error {
   417  	file, err := os.Create(path)
   418  	if err != nil {
   419  		return err
   420  	}
   421  
   422  	file.Close()
   423  
   424  	return nil
   425  }
   426  
   427  // touchFile updates the a&mtime of the given path to the given time.
   428  func touchFile(path string, t time.Time) error {
   429  	return os.Chtimes(path, t.Local(), t.Local())
   430  }
   431  
   432  // getOldestMtimeOfWalkFiles looks in sourceDir for walk log files and returns
   433  // their oldest mtime.
   434  func getOldestMtimeOfWalkFiles(dir string) (time.Time, error) {
   435  	paths, err := filepath.Glob(fmt.Sprintf("%s/*/*/*%s", dir, statLogOutputFileSuffix))
   436  	if err != nil || len(paths) == 0 {
   437  		die("failed to find walk log files based on [%s/*/*/*%s] (err: %s)", dir, statLogOutputFileSuffix, err)
   438  	}
   439  
   440  	oldestT := time.Now()
   441  
   442  	for _, path := range paths {
   443  		info, err := os.Stat(path)
   444  		if err != nil {
   445  			return time.Time{}, err
   446  		}
   447  
   448  		if info.ModTime().Before(oldestT) {
   449  			oldestT = info.ModTime()
   450  		}
   451  	}
   452  
   453  	return oldestT, nil
   454  }