github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/deployments/tools/export/lakefs_export.py (about)

     1  #!/usr/bin/env python3
     2  
     3  import argparse
     4  import subprocess
     5  from tempfile import NamedTemporaryFile
     6  import os
     7  import posixpath  # Works for URL pathname manipulation on Windows too
     8  import sys
     9  import time
    10  from datetime import datetime
    11  from string import Template
    12  
    13  LAKEFS_ACCESS_KEY = os.getenv('LAKEFS_ACCESS_KEY_ID')
    14  LAKEFS_SECRET_KEY = os.getenv('LAKEFS_SECRET_ACCESS_KEY')
    15  LAKEFS_ENDPOINT = os.getenv('LAKEFS_ENDPOINT')
    16  S3_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY_ID')
    17  S3_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
    18  
    19  SUCCESS_MSG = "Export completed successfully!"
    20  
    21  
    22  def create_rclone_conf_file():
    23      with open('rclone.conf.template') as f:
    24          src = Template(f.read())
    25          vars = {
    26              'lakefs_access_key': LAKEFS_ACCESS_KEY,
    27              'lakefs_secret_key': LAKEFS_SECRET_KEY,
    28              'lakefs_endpoint': LAKEFS_ENDPOINT,
    29              's3_access_key': S3_ACCESS_KEY,
    30              's3_secret_key': S3_SECRET_KEY,
    31          }
    32          res = src.substitute(vars)
    33      with open('rclone.conf', 'w+') as f:
    34          f.write(res)
    35      return
    36  
    37  
    38  def set_args():
    39      parser = argparse.ArgumentParser(description='Process lakeFS export.')
    40  
    41      parser.add_argument('Repo', metavar='repo', type=str,
    42                          help='name of lakeFS repository')
    43      parser.add_argument('Dest', metavar='dest', type=str,
    44                          help='path of destination')
    45      parser.add_argument('--branch', metavar='branch', type=str, action='store',
    46                          help=('relevant branch in the repository to export '
    47                                'from'))
    48      parser.add_argument('--commit_id', metavar='commit', type=str,
    49                          action='store',
    50                          help=('relevant commit on the repository to export'
    51                                'from'))
    52      parser.add_argument('--prev_commit_id', metavar='previous-commit',
    53                          type=str, action='store',
    54                          help=('if specified, export only the difference '
    55                                'between this commit ID and the head of the '
    56                                'branch'))
    57  
    58      args = parser.parse_args()
    59      return args
    60  
    61  
    62  def process_output(dest, src):
    63      """Process rclone output on file-like object src into file dst.
    64  
    65  Rewrite lines to explain more and remove weird logging indicators."""
    66      for line in src:
    67          line = line.removesuffix('\n')
    68          if line.startswith('-'):
    69              print("path missing in source:", line, file=dest)
    70          elif line.startswith('+'):
    71              print("path missing on destination:", line, file=dest)
    72          elif line.startswith('*'):
    73              print("path different in source and destination:", line, file=dest)
    74          elif line.startswith('!'):
    75              print("error reading or hashing source or dest", line, file=dest)
    76      return
    77  
    78  
    79  def error(msg, statuscode=1):
    80      print(msg, file=sys.stderr)
    81      exit(statuscode)
    82  
    83  
    84  def main():
    85      # create rclone configuration file
    86      create_rclone_conf_file()
    87  
    88      args = set_args()
    89  
    90      reference = ""
    91      source = "lakefs:" + args.Repo + "/"
    92      has_branch = (args.branch is not None)
    93      has_commit = (args.commit_id is not None)
    94      export_diff = (args.prev_commit_id is not None)
    95      if has_branch and not has_commit:
    96          source += args.branch + "/"
    97          reference = args.branch
    98      elif not has_branch and has_commit:
    99          source += args.commit_id + "/"
   100          reference = args.commit_id
   101      elif has_branch:            # and has_commit
   102          error("Cannot set both branch and commit_id")
   103      else:                       # not has_branch and not has_commit
   104          error("Must set one of branch, commit_id")
   105  
   106      if has_commit and export_diff:
   107          error("Cannot export diff between two commits.")
   108  
   109      now = datetime.utcfromtimestamp(time.time())
   110      status_file_name_base = (f"EXPORT_{reference}_"
   111                               f"{now.strftime('%d-%m-%Y_%H:%M:%S')}")
   112  
   113      rclone_command = "sync" if export_diff else "copy"
   114      cmd = ["rclone", rclone_command, source, args.Dest, "--config=rclone.conf",
   115             "--create-empty-src-dirs"]
   116  
   117      process = subprocess.run(cmd)
   118      if process.returncode != 0:
   119          error(f"rclone {rclone_command} failed")
   120  
   121      # check export and create status file
   122      check_cmd = ["rclone", "check", source, args.Dest,
   123                   "--config=rclone.conf", "--combined=-"]
   124      # if not export_diff:
   125      #     # Use the flag --one-way to check a copy command: only need to check
   126      #     # that the source files were copied to the destination
   127      #     check_cmd.append("--one-way")
   128  
   129      check_process = subprocess.Popen(check_cmd, stdout=subprocess.PIPE,
   130                                       text=True)
   131  
   132      local_status = NamedTemporaryFile(
   133          prefix="lakefs_export_status_", suffix=".temp",
   134          mode="w", delete=False)
   135      try:
   136          # local_status cannot be re-opened for read on Windows until we
   137          # close it for writing.
   138          process_output(local_status, check_process.stdout)
   139  
   140          # rclone writes until its done, check_process.stdout is closed, so
   141          # the rclone process ended and wait will not block.
   142          check_process.wait()
   143  
   144          # Upload status file to destination bucket
   145          success = check_process.returncode == 0
   146          if success:
   147              print(SUCCESS_MSG, file=local_status)
   148          local_status.close()
   149  
   150          status_file_name = (f"{status_file_name_base}_"
   151                              f"{'SUCCESS' if success else 'FAILURE'}")
   152          dest_path = posixpath.join(args.Dest, status_file_name)
   153  
   154          upload_process = subprocess.run(["rclone", "copyto", local_status.name,
   155                                           dest_path, "--config=rclone.conf"])
   156          if upload_process.returncode != 0:
   157              print("Failed to upload status file", file=sys.stderr)
   158      finally:
   159          os.remove(local_status.name)
   160  
   161      if not success:
   162          exit(1)
   163  
   164  
# Run the export only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()