github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/tools/bigtable-backup/bigtable-backup.py (about) 1 import argparse 2 import subprocess 3 import time 4 import json 5 6 from datetime import datetime, timedelta 7 import pytz 8 from prometheus_client import CollectorRegistry, Gauge, push_to_gateway 9 10 registry = CollectorRegistry() 11 bigtable_backup_job_last_run_seconds = Gauge('bigtable_backup_job_last_run_seconds', 'Last time a bigtable backup job ran at.', registry=registry) 12 bigtable_backup_job_last_success_seconds = Gauge('bigtable_backup_job_last_success_seconds', 'Last time a bigtable backup job successfully finished.', registry=registry) 13 bigtable_backup_job_runtime_seconds = Gauge('bigtable_backup_job_runtime_seconds', 'Runtime of last successfully finished bigtable backup job.', registry=registry) 14 bigtable_backup_job_backups_created = Gauge('bigtable_backup_job_backups_created', 'Number of backups created during last run.', registry=registry) 15 bigtable_backup_job_last_active_table_backup_time_seconds = Gauge('bigtable_backup_job_last_active_table_backup_time_seconds', 'Last time an active table was backed up at.', registry=registry) 16 17 job_backup_active_periodic_table = "backup-active-periodic-table" 18 job_ensure_backups = "ensure-backups" 19 20 def secs_to_periodic_table_number(periodic_secs): 21 return time.time() / periodic_secs 22 23 24 def backup_active_periodic_table(args): 25 push_job_started_metric(args.prom_push_gateway_endpoint, args.namespace, job_backup_active_periodic_table) 26 start_time = time.time() 27 28 table_id = args.bigtable_table_id_prefix + str(int(time.time() / args.periodic_table_duration)) 29 create_backup(table_id, args) 30 31 bigtable_backup_job_last_active_table_backup_time_seconds.set_to_current_time() 32 push_job_finished_metric(args.prom_push_gateway_endpoint, args.namespace, job_backup_active_periodic_table, int(time.time() - start_time)) 33 34 35 def ensure_backups(args): 36 push_job_started_metric(args.prom_push_gateway_endpoint, args.namespace, job_ensure_backups) 37 start_time = time.time() 38 39 if (args.duration == None and args.period_from == None) or (args.duration != None and args.period_from != None): 40 raise ValueError("Either of --duration or --periodic-table-duration must be set") 41 42 backups = list_backups(args.destination_path) 43 44 if args.period_from == None: 45 period_from = datetime.utcnow() - timedelta(seconds=args.duration) 46 args.period_from = valid_date(period_from.strftime("%Y-%m-%d")) 47 args.period_to = valid_date(datetime.utcnow().strftime("%Y-%m-%d")) 48 49 oldest_table_number = int(args.period_from.timestamp() / args.periodic_table_duration) 50 newest_table_number = int(args.period_to.timestamp() / args.periodic_table_duration) 51 active_table_number = int(time.time() / args.periodic_table_duration) 52 53 print("Checking right backups exist") 54 table_number_to_check = oldest_table_number 55 while table_number_to_check <= newest_table_number: 56 table_id = args.bigtable_table_id_prefix + str(table_number_to_check) 57 table_number_to_check += 1 58 if table_id not in backups: 59 print("backup for {} not found".format(table_id)) 60 create_backup(table_id, args) 61 if table_id == active_table_number: 62 bigtable_backup_job_last_active_table_backup_time_seconds.set_to_current_time() 63 64 num_backups_deleted = 0 65 66 print("Checking whether all the backups are created after their period is over") 67 for table_id, timestamps in backups.items(): 68 table_number = int(table_id.rsplit("_", 1)[-1]) 69 last_timestamp_from_table_number = find_last_timestamp_from_table_number(table_number, 70 args.periodic_table_duration) 71 72 # Checking whether backup is created after last timestamp of tables period. 73 if last_timestamp_from_table_number > timestamps[-1]: 74 create_backup(table_id, args) 75 76 # list backups again to consider for deletion of unwanted backups since new backups might have been created above 77 backups = list_backups(args.destination_path) 78 79 print("Deleting old unwanted backups") 80 for table_id, timestamps in backups.items(): 81 table_number = int(table_id.rsplit("_", 1)[-1]) 82 83 # Retain only most recent backup for non active table 84 if table_number != active_table_number and len(timestamps) > 1: 85 for timestamp in timestamps[:-1]: 86 delete_backup(table_id, str(timestamp), args) 87 num_backups_deleted += 1 88 89 if args.delete_out_of_range_backups: 90 num_backups_deleted += delete_out_of_range_backups(oldest_table_number, newest_table_number, backups, args) 91 92 set_ensure_backups_specific_metrics(args, num_backups_deleted, active_table_number) 93 push_job_finished_metric(args.prom_push_gateway_endpoint, args.namespace, job_ensure_backups, int(time.time() - start_time)) 94 95 def find_last_timestamp_from_table_number(table_number, periodic_secs): 96 return ((table_number + 1) * periodic_secs) - 1 97 98 def list_backups(backup_path): 99 popen = subprocess.Popen(['bigtable-backup', 'list-backups', '-ojson', '--backup-path', backup_path], 100 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 101 popen.wait() 102 103 return json.loads(popen.stdout.readline()) 104 105 def set_ensure_backups_specific_metrics(args, num_backups_deleted, active_table_number): 106 # ensure-backups job specific metrics 107 bigtable_backup_job_tables_backed_up = Gauge('bigtable_backup_job_tables_backed_up', 'Number of active and inactive tables backed up.', ['kind'], registry=registry) 108 bigtable_backup_job_backups = Gauge('bigtable_backup_job_backups', 'Number of backups for all active and inactive tables.', ['kind'], registry=registry) 109 bigtable_backup_job_backups_deleted = Gauge('bigtable_backup_job_backups_deleted', 'Number of backups deleted during last run.', registry=registry) 110 bigtable_backup_job_expected_inactive_table_backups = Gauge('bigtable_backup_job_expected_inactive_table_backups', 'Expected number of backups for inactive tables.', registry=registry) 111 112 duration = args.duration 113 if args.duration == None: 114 duration = (args.period_to - args.period_from).seconds 115 116 # there should be 1 backup per inactive table 117 bigtable_backup_job_expected_inactive_table_backups.set(int(duration/args.periodic_table_duration)) 118 119 bigtable_backup_job_backups_deleted.set(num_backups_deleted) 120 121 backups = list_backups(args.destination_path) 122 inactive_table_backups_count = 0 123 124 # setting sum of number of backups per table 125 for table_id, timestamps in backups.items(): 126 table_number = int(table_id.rsplit("_", 1)[-1]) 127 128 label = 'active' 129 if active_table_number != table_number: 130 label = 'inactive' 131 inactive_table_backups_count += 1 132 133 bigtable_backup_job_backups.labels(label).inc(len(timestamps)) 134 135 bigtable_backup_job_tables_backed_up.labels('inactive').set(inactive_table_backups_count) 136 if len(backups) != inactive_table_backups_count: 137 bigtable_backup_job_tables_backed_up.labels('active').set(1) 138 139 def valid_date(s): 140 try: 141 dt = datetime.utcnow().strptime(s, "%Y-%m-%d") 142 utc = pytz.timezone('UTC') 143 return utc.localize(dt) 144 except ValueError: 145 msg = "Not a valid date: '{0}'.".format(s) 146 raise argparse.ArgumentTypeError(msg) 147 148 149 def valid_duration(s): 150 try: 151 return int(s) * 3600 152 except ValueError: 153 msg = "Not a valid duration: '{0}'.".format(s) 154 raise argparse.ArgumentTypeError(msg) 155 156 157 def valid_table_id_prefix(s): 158 if not str(s).endswith("_"): 159 return str(s) + "_" 160 161 162 def create_backup(table_id, args): 163 popen = subprocess.Popen(['bigtable-backup', 'create', '--bigtable-table-id-prefix', table_id, 164 '--temp-prefix', args.temp_prefix, '--bigtable-project-id', args.bigtable_project_id, 165 '--bigtable-instance-id', args.bigtable_instance_id, '--destination-path', 166 args.destination_path], 167 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 168 popen.wait() 169 if popen.returncode != 0: 170 raise Exception("Failed to create backup with error {}".format(b"".join(popen.stdout.readlines()).decode())) 171 else: 172 print("Backup created for table {}".format(table_id)) 173 bigtable_backup_job_backups_created.inc(1) 174 175 def delete_backup(table_id, timestamp, args): 176 popen = subprocess.Popen(['bigtable-backup', 'delete-backup', '--bigtable-table-id', table_id, 177 '--backup-path', args.destination_path, "--backup-timestamp", timestamp], 178 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 179 popen.wait() 180 if popen.returncode != 0: 181 raise Exception("Failed to delete backup with error {}".format(b"".join(popen.stdout.readlines()).decode())) 182 else: 183 print(popen.stdout.readlines()) 184 185 def push_job_started_metric(endpoint, namespace, job): 186 try: 187 bigtable_backup_job_last_run_seconds.set_to_current_time() 188 push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry) 189 except Exception as e: 190 print("failed to push metrics with error {}".format(e)) 191 192 def push_job_finished_metric(endpoint, namespace, job, runtime): 193 try: 194 bigtable_backup_job_last_success_seconds.set_to_current_time() 195 bigtable_backup_job_runtime_seconds.set(runtime) 196 push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry) 197 except Exception as e: 198 print("failed to push metrics with error {}".format(e)) 199 200 def push_metrics(endpoint, namespace, job): 201 try: 202 push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry) 203 except Exception as e: 204 print("failed to push metrics with error {}".format(e)) 205 206 def delete_out_of_range_backups(oldest_table_number, newest_table_number, backups, args): 207 num_backups_deleted = 0 208 for table_id, timestamps in backups.items(): 209 table_number = int(table_id.rsplit("_", 1)[-1]) 210 if table_number < oldest_table_number or table_number > newest_table_number: 211 for timestamp in timestamps: 212 delete_backup(table_id, str(timestamp), args) 213 num_backups_deleted += 1 214 215 return num_backups_deleted 216 217 218 def main(): 219 parser = argparse.ArgumentParser() 220 subparser = parser.add_subparsers(help="commands") 221 parser.add_argument("--bigtable-project-id", required=True, 222 help="The ID of the GCP project of the Cloud Bigtable instance") 223 parser.add_argument("--bigtable-instance-id", required=True, 224 help="The ID of the Cloud Bigtable instance that contains the tables") 225 parser.add_argument("--bigtable-table-id-prefix", required=True, type=valid_table_id_prefix, 226 help="Prefix to build IDs of the tables using periodic-table-duration") 227 parser.add_argument("--destination-path", required=True, 228 help="GCS path where data should be written. For example, gs://mybucket/somefolder/") 229 parser.add_argument("--temp-prefix", required=True, 230 help="Path and filename prefix for writing temporary files. ex: gs://MyBucket/tmp") 231 parser.add_argument("--periodic-table-duration", required=True, type=valid_duration, 232 help="Periodic config set for loki tables in hours") 233 parser.add_argument("--prom-push-gateway-endpoint", default="localhost:9091", help="Endpoint where metrics are to be pushed") 234 parser.add_argument("--namespace", default="default", help="namespace while reporting metrics") 235 236 backup_active_periodic_table_parser = subparser.add_parser(job_backup_active_periodic_table, 237 help="Backup active periodic table") 238 backup_active_periodic_table_parser.set_defaults(func=backup_active_periodic_table) 239 240 ensure_backups_parser = subparser.add_parser(job_ensure_backups, 241 help="Ensure backups of right tables exist") 242 ensure_backups_parser.add_argument('--duration', help="Duration in hours for which backups should exist. " 243 "Must not be set with --period-from and --period-to", type=valid_duration) 244 ensure_backups_parser.add_argument('--period-from', type=valid_date, help="Backups should exist starting from the date. Must not be set with --duration") 245 ensure_backups_parser.add_argument('--period-to', type=valid_date, 246 default=datetime.utcnow().strftime("%Y-%m-%d")) 247 ensure_backups_parser.add_argument('--delete-out-of-range-backups', help="Delete backups which are out of range of duration for which backups are being ensured", 248 default=False) 249 ensure_backups_parser.set_defaults(func=ensure_backups) 250 251 args = parser.parse_args() 252 253 args.func(args) 254 255 256 if __name__ == "__main__": 257 main()