#!/usr/bin/env python3
"""Export a lakeFS branch or commit to a destination path using rclone.

Origin: github.com/treeverse/lakefs,
deployments/tools/export/lakefs_export.py
"""

import argparse
import subprocess
from tempfile import NamedTemporaryFile
import os
import posixpath  # Works for URL pathname manipulation on Windows too
import sys
import time
from datetime import datetime
from string import Template

# Credentials and endpoint are read once, at import time, from the
# environment.  Any variable that is not set is None.
LAKEFS_ACCESS_KEY = os.getenv('LAKEFS_ACCESS_KEY_ID')
LAKEFS_SECRET_KEY = os.getenv('LAKEFS_SECRET_ACCESS_KEY')
LAKEFS_ENDPOINT = os.getenv('LAKEFS_ENDPOINT')
S3_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY_ID')
S3_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')

SUCCESS_MSG = "Export completed successfully!"


def create_rclone_conf_file():
    """Render ``rclone.conf`` from ``rclone.conf.template``.

    Both files are resolved relative to the current working directory.
    The template uses ``string.Template`` placeholders
    (``$lakefs_access_key``, ``$lakefs_secret_key``, ``$lakefs_endpoint``,
    ``$s3_access_key``, ``$s3_secret_key``), which are filled from the
    module-level constants.  A constant whose environment variable was
    not set is None and is therefore rendered as the text ``None``.

    Raises:
        FileNotFoundError: if ``rclone.conf.template`` does not exist.
        KeyError: if the template contains a placeholder that is not one
            of the names listed above.
    """
    with open('rclone.conf.template') as template_file:
        template = Template(template_file.read())

    substitutions = {
        'lakefs_access_key': LAKEFS_ACCESS_KEY,
        'lakefs_secret_key': LAKEFS_SECRET_KEY,
        'lakefs_endpoint': LAKEFS_ENDPOINT,
        's3_access_key': S3_ACCESS_KEY,
        's3_secret_key': S3_SECRET_KEY,
    }
    rendered = template.substitute(substitutions)

    # The file is only written, never read back, so plain 'w' is enough.
    with open('rclone.conf', 'w') as conf_file:
        conf_file.write(rendered)


def set_args(argv=None):
    """Parse the command line of the export script.

    Args:
        argv: optional list of argument strings to parse.  When None
            (the default) the arguments are taken from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with attributes ``Repo``, ``Dest``,
        ``branch``, ``commit_id`` and ``prev_commit_id``; the three
        optional ones are None when not given.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(description='Process lakeFS export.')

    parser.add_argument('Repo', metavar='repo', type=str,
                        help='name of lakeFS repository')
    parser.add_argument('Dest', metavar='dest', type=str,
                        help='path of destination')
    parser.add_argument('--branch', metavar='branch', type=str,
                        action='store',
                        help=('relevant branch in the repository to export '
                              'from'))
    # The trailing space after "export" matters: the two literals are
    # concatenated and previously rendered as "exportfrom".
    parser.add_argument('--commit_id', metavar='commit', type=str,
                        action='store',
                        help=('relevant commit on the repository to export '
                              'from'))
    parser.add_argument('--prev_commit_id', metavar='previous-commit',
                        type=str, action='store',
                        help=('if specified, export only the difference '
                              'between this commit ID and the head of the '
                              'branch'))

    return parser.parse_args(argv)
def process_output(dest, src):
    """Rewrite ``rclone check --combined`` output from src into dest.

    Each line of the file-like object src is inspected by its first
    character.  Lines starting with ``-``, ``+``, ``*`` or ``!`` are
    written to the file-like object dest with an explanatory prefix; the
    original line, including its marker character, follows the prefix.
    Every other line (including empty ones) is dropped.

    Args:
        dest: writable text file-like object receiving the explanations.
        src: iterable of text lines, e.g. the stdout of ``rclone check``.
    """
    # The ':' (or its absence) is part of each message and is kept as is.
    explanations = {
        '-': "path missing in source:",
        '+': "path missing on destination:",
        '*': "path different in source and destination:",
        '!': "error reading or hashing source or dest",
    }
    for line in src:
        line = line.removesuffix('\n')
        # line[:1] is '' for an empty line, which is never a known marker.
        explanation = explanations.get(line[:1])
        if explanation is not None:
            print(explanation, line, file=dest)


def error(msg, statuscode=1):
    """Print msg to stderr and terminate with the given exit status.

    Args:
        msg: text to print on standard error.
        statuscode: process exit status (default 1).

    Raises:
        SystemExit: always.
    """
    print(msg, file=sys.stderr)
    # sys.exit is always available; the bare ``exit`` builtin is injected
    # by the ``site`` module and is missing under ``python -S``.
    sys.exit(statuscode)


def main():
    """Run the export: copy/sync with rclone, verify, upload a status file.

    Steps:
      1. Parse and validate the command line (exactly one of --branch or
         --commit_id; --prev_commit_id cannot be combined with
         --commit_id).
      2. Write ``rclone.conf`` and run ``rclone sync`` (when
         --prev_commit_id is given) or ``rclone copy`` from the lakeFS
         reference to the destination.
      3. Run ``rclone check`` and write an explained report to a local
         temporary file.
      4. Upload the report to the destination as
         ``EXPORT_<reference>_<UTC timestamp>_<SUCCESS|FAILURE>``.

    Exits with status 1 when validation fails, when the copy/sync fails,
    or when the check reports a non-zero return code.  A failed upload of
    the status file is only reported on stderr.
    """
    args = set_args()

    has_branch = args.branch is not None
    has_commit = args.commit_id is not None
    export_diff = args.prev_commit_id is not None

    if has_branch and has_commit:
        error("Cannot set both branch and commit_id")
    if not has_branch and not has_commit:
        error("Must set one of branch, commit_id")
    if has_commit and export_diff:
        error("Cannot export diff between two commits.")

    reference = args.branch if has_branch else args.commit_id
    source = f"lakefs:{args.Repo}/{reference}/"

    # Credentials are written to disk only after the arguments have been
    # accepted, so --help or a usage error does not create rclone.conf.
    create_rclone_conf_file()

    # UTC timestamp of the start of the export, used in the status file
    # name.
    timestamp = time.strftime('%d-%m-%Y_%H:%M:%S', time.gmtime())
    status_file_name_base = f"EXPORT_{reference}_{timestamp}"

    rclone_command = "sync" if export_diff else "copy"
    cmd = ["rclone", rclone_command, source, args.Dest,
           "--config=rclone.conf", "--create-empty-src-dirs"]

    process = subprocess.run(cmd)
    if process.returncode != 0:
        error(f"rclone {rclone_command} failed")

    # Check the export and create the status file.
    # NOTE: --one-way is not passed here (earlier code that appended it
    # for plain copies was disabled), so the check is not restricted to
    # "source files exist on the destination".
    check_cmd = ["rclone", "check", source, args.Dest,
                 "--config=rclone.conf", "--combined=-"]

    local_status = NamedTemporaryFile(
        prefix="lakefs_export_status_", suffix=".temp",
        mode="w", delete=False)
    try:
        # Leaving the ``with`` block closes the pipe and waits for rclone,
        # so returncode is set afterwards.
        with subprocess.Popen(check_cmd, stdout=subprocess.PIPE,
                              text=True) as check_process:
            process_output(local_status, check_process.stdout)

        success = check_process.returncode == 0
        if success:
            print(SUCCESS_MSG, file=local_status)
        # local_status cannot be re-opened for read on Windows until we
        # close it for writing.
        local_status.close()

        # Upload status file to destination bucket
        status_file_name = (f"{status_file_name_base}_"
                            f"{'SUCCESS' if success else 'FAILURE'}")
        dest_path = posixpath.join(args.Dest, status_file_name)

        upload_process = subprocess.run(
            ["rclone", "copyto", local_status.name, dest_path,
             "--config=rclone.conf"])
        if upload_process.returncode != 0:
            print("Failed to upload status file", file=sys.stderr)
    finally:
        # close() is idempotent; closing first lets the removal succeed on
        # Windows even when an exception interrupted the block above.
        local_status.close()
        os.remove(local_status.name)

    if not success:
        sys.exit(1)


if __name__ == '__main__':
    main()