github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/tools/bigtable-backup/bigtable-backup.py (about)

import argparse
import json
import subprocess
import time
from datetime import datetime, timedelta, timezone

import pytz
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
     9  
    10  registry = CollectorRegistry()
    11  bigtable_backup_job_last_run_seconds = Gauge('bigtable_backup_job_last_run_seconds', 'Last time a bigtable backup job ran at.', registry=registry)
    12  bigtable_backup_job_last_success_seconds = Gauge('bigtable_backup_job_last_success_seconds', 'Last time a bigtable backup job successfully finished.', registry=registry)
    13  bigtable_backup_job_runtime_seconds =  Gauge('bigtable_backup_job_runtime_seconds', 'Runtime of last successfully finished bigtable backup job.', registry=registry)
    14  bigtable_backup_job_backups_created = Gauge('bigtable_backup_job_backups_created', 'Number of backups created during last run.', registry=registry)
    15  bigtable_backup_job_last_active_table_backup_time_seconds = Gauge('bigtable_backup_job_last_active_table_backup_time_seconds', 'Last time an active table was backed up at.', registry=registry)
    16  
    17  job_backup_active_periodic_table = "backup-active-periodic-table"
    18  job_ensure_backups = "ensure-backups"
    19  
    20  def secs_to_periodic_table_number(periodic_secs):
    21      return time.time() / periodic_secs
    22  
    23  
    24  def backup_active_periodic_table(args):
    25      push_job_started_metric(args.prom_push_gateway_endpoint, args.namespace, job_backup_active_periodic_table)
    26      start_time = time.time()
    27  
    28      table_id = args.bigtable_table_id_prefix + str(int(time.time() / args.periodic_table_duration))
    29      create_backup(table_id, args)
    30  
    31      bigtable_backup_job_last_active_table_backup_time_seconds.set_to_current_time()
    32      push_job_finished_metric(args.prom_push_gateway_endpoint, args.namespace, job_backup_active_periodic_table, int(time.time() - start_time))
    33  
    34  
    35  def ensure_backups(args):
    36      push_job_started_metric(args.prom_push_gateway_endpoint, args.namespace, job_ensure_backups)
    37      start_time = time.time()
    38  
    39      if (args.duration == None and args.period_from == None) or (args.duration != None and args.period_from != None):
    40          raise ValueError("Either of --duration or --periodic-table-duration must be set")
    41  
    42      backups = list_backups(args.destination_path)
    43  
    44      if args.period_from == None:
    45          period_from = datetime.utcnow() - timedelta(seconds=args.duration)
    46          args.period_from = valid_date(period_from.strftime("%Y-%m-%d"))
    47          args.period_to = valid_date(datetime.utcnow().strftime("%Y-%m-%d"))
    48  
    49      oldest_table_number = int(args.period_from.timestamp() / args.periodic_table_duration)
    50      newest_table_number = int(args.period_to.timestamp() / args.periodic_table_duration)
    51      active_table_number = int(time.time() / args.periodic_table_duration)
    52  
    53      print("Checking right backups exist")
    54      table_number_to_check = oldest_table_number
    55      while table_number_to_check <= newest_table_number:
    56          table_id = args.bigtable_table_id_prefix + str(table_number_to_check)
    57          table_number_to_check += 1
    58          if table_id not in backups:
    59              print("backup for {} not found".format(table_id))
    60              create_backup(table_id, args)
    61          if table_id == active_table_number:
    62              bigtable_backup_job_last_active_table_backup_time_seconds.set_to_current_time()
    63  
    64      num_backups_deleted = 0
    65  
    66      print("Checking whether all the backups are created after their period is over")
    67      for table_id, timestamps in backups.items():
    68          table_number = int(table_id.rsplit("_", 1)[-1])
    69          last_timestamp_from_table_number = find_last_timestamp_from_table_number(table_number,
    70                                                                                   args.periodic_table_duration)
    71  
    72          # Checking whether backup is created after last timestamp of tables period.
    73          if last_timestamp_from_table_number > timestamps[-1]:
    74              create_backup(table_id, args)
    75  
    76      # list backups again to consider for deletion of unwanted backups since new backups might have been created above
    77      backups = list_backups(args.destination_path)
    78  
    79      print("Deleting old unwanted backups")
    80      for table_id, timestamps in backups.items():
    81          table_number = int(table_id.rsplit("_", 1)[-1])
    82  
    83          # Retain only most recent backup for non active table
    84          if table_number != active_table_number and len(timestamps) > 1:
    85              for timestamp in timestamps[:-1]:
    86                  delete_backup(table_id, str(timestamp), args)
    87                  num_backups_deleted += 1
    88  
    89      if args.delete_out_of_range_backups:
    90          num_backups_deleted += delete_out_of_range_backups(oldest_table_number, newest_table_number, backups, args)
    91  
    92      set_ensure_backups_specific_metrics(args, num_backups_deleted, active_table_number)
    93      push_job_finished_metric(args.prom_push_gateway_endpoint, args.namespace, job_ensure_backups, int(time.time() - start_time))
    94  
    95  def find_last_timestamp_from_table_number(table_number, periodic_secs):
    96      return ((table_number + 1) * periodic_secs) - 1
    97  
    98  def list_backups(backup_path):
    99      popen = subprocess.Popen(['bigtable-backup', 'list-backups', '-ojson', '--backup-path', backup_path],
   100                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
   101      popen.wait()
   102  
   103      return json.loads(popen.stdout.readline())
   104  
   105  def set_ensure_backups_specific_metrics(args, num_backups_deleted, active_table_number):
   106      # ensure-backups job specific metrics
   107      bigtable_backup_job_tables_backed_up = Gauge('bigtable_backup_job_tables_backed_up', 'Number of active and inactive tables backed up.', ['kind'], registry=registry)
   108      bigtable_backup_job_backups = Gauge('bigtable_backup_job_backups', 'Number of backups for all active and inactive tables.', ['kind'], registry=registry)
   109      bigtable_backup_job_backups_deleted = Gauge('bigtable_backup_job_backups_deleted', 'Number of backups deleted during last run.', registry=registry)
   110      bigtable_backup_job_expected_inactive_table_backups = Gauge('bigtable_backup_job_expected_inactive_table_backups', 'Expected number of backups for inactive tables.', registry=registry)
   111  
   112      duration = args.duration
   113      if args.duration == None:
   114          duration = (args.period_to - args.period_from).seconds
   115  
   116      # there should be 1 backup per inactive table
   117      bigtable_backup_job_expected_inactive_table_backups.set(int(duration/args.periodic_table_duration))
   118  
   119      bigtable_backup_job_backups_deleted.set(num_backups_deleted)
   120  
   121      backups = list_backups(args.destination_path)
   122      inactive_table_backups_count = 0
   123  
   124      # setting sum of number of backups per table
   125      for table_id, timestamps in backups.items():
   126          table_number = int(table_id.rsplit("_", 1)[-1])
   127  
   128          label = 'active'
   129          if active_table_number != table_number:
   130              label = 'inactive'
   131              inactive_table_backups_count += 1
   132  
   133          bigtable_backup_job_backups.labels(label).inc(len(timestamps))
   134  
   135      bigtable_backup_job_tables_backed_up.labels('inactive').set(inactive_table_backups_count)
   136      if len(backups) != inactive_table_backups_count:
   137          bigtable_backup_job_tables_backed_up.labels('active').set(1)
   138  
   139  def valid_date(s):
   140      try:
   141          dt = datetime.utcnow().strptime(s, "%Y-%m-%d")
   142          utc = pytz.timezone('UTC')
   143          return utc.localize(dt)
   144      except ValueError:
   145          msg = "Not a valid date: '{0}'.".format(s)
   146          raise argparse.ArgumentTypeError(msg)
   147  
   148  
   149  def valid_duration(s):
   150      try:
   151          return int(s) * 3600
   152      except ValueError:
   153          msg = "Not a valid duration: '{0}'.".format(s)
   154          raise argparse.ArgumentTypeError(msg)
   155  
   156  
   157  def valid_table_id_prefix(s):
   158      if not str(s).endswith("_"):
   159          return str(s) + "_"
   160  
   161  
   162  def create_backup(table_id, args):
   163      popen = subprocess.Popen(['bigtable-backup', 'create', '--bigtable-table-id-prefix', table_id,
   164                                '--temp-prefix', args.temp_prefix, '--bigtable-project-id', args.bigtable_project_id,
   165                                '--bigtable-instance-id', args.bigtable_instance_id, '--destination-path',
   166                                args.destination_path],
   167                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
   168      popen.wait()
   169      if popen.returncode != 0:
   170          raise Exception("Failed to create backup with error {}".format(b"".join(popen.stdout.readlines()).decode()))
   171      else:
   172          print("Backup created for table {}".format(table_id))
   173          bigtable_backup_job_backups_created.inc(1)
   174  
   175  def delete_backup(table_id, timestamp, args):
   176      popen = subprocess.Popen(['bigtable-backup', 'delete-backup', '--bigtable-table-id', table_id,
   177                                '--backup-path', args.destination_path, "--backup-timestamp", timestamp],
   178                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
   179      popen.wait()
   180      if popen.returncode != 0:
   181          raise Exception("Failed to delete backup with error {}".format(b"".join(popen.stdout.readlines()).decode()))
   182      else:
   183          print(popen.stdout.readlines())
   184  
   185  def push_job_started_metric(endpoint, namespace, job):
   186      try:
   187          bigtable_backup_job_last_run_seconds.set_to_current_time()
   188          push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry)
   189      except Exception as e:
   190          print("failed to push metrics with error {}".format(e))
   191  
   192  def push_job_finished_metric(endpoint, namespace, job, runtime):
   193      try:
   194          bigtable_backup_job_last_success_seconds.set_to_current_time()
   195          bigtable_backup_job_runtime_seconds.set(runtime)
   196          push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry)
   197      except Exception as e:
   198          print("failed to push metrics with error {}".format(e))
   199  
   200  def push_metrics(endpoint, namespace, job):
   201      try:
   202          push_to_gateway(endpoint, job="{}/{}".format(namespace, job), registry=registry)
   203      except Exception as e:
   204          print("failed to push metrics with error {}".format(e))
   205  
   206  def delete_out_of_range_backups(oldest_table_number, newest_table_number, backups, args):
   207      num_backups_deleted = 0
   208      for table_id, timestamps in backups.items():
   209          table_number = int(table_id.rsplit("_", 1)[-1])
   210          if table_number < oldest_table_number or table_number > newest_table_number:
   211              for timestamp in timestamps:
   212                  delete_backup(table_id, str(timestamp), args)
   213                  num_backups_deleted += 1
   214  
   215      return num_backups_deleted
   216  
   217  
   218  def main():
   219      parser = argparse.ArgumentParser()
   220      subparser = parser.add_subparsers(help="commands")
   221      parser.add_argument("--bigtable-project-id", required=True,
   222                          help="The ID of the GCP project of the Cloud Bigtable instance")
   223      parser.add_argument("--bigtable-instance-id", required=True,
   224                          help="The ID of the Cloud Bigtable instance that contains the tables")
   225      parser.add_argument("--bigtable-table-id-prefix", required=True, type=valid_table_id_prefix,
   226                          help="Prefix to build IDs of the tables using periodic-table-duration")
   227      parser.add_argument("--destination-path", required=True,
   228                          help="GCS path where data should be written. For example, gs://mybucket/somefolder/")
   229      parser.add_argument("--temp-prefix", required=True,
   230                          help="Path and filename prefix for writing temporary files. ex: gs://MyBucket/tmp")
   231      parser.add_argument("--periodic-table-duration", required=True, type=valid_duration,
   232                          help="Periodic config set for loki tables in hours")
   233      parser.add_argument("--prom-push-gateway-endpoint", default="localhost:9091", help="Endpoint where metrics are to be pushed")
   234      parser.add_argument("--namespace", default="default", help="namespace while reporting metrics")
   235  
   236      backup_active_periodic_table_parser = subparser.add_parser(job_backup_active_periodic_table,
   237                                                                 help="Backup active periodic table")
   238      backup_active_periodic_table_parser.set_defaults(func=backup_active_periodic_table)
   239  
   240      ensure_backups_parser = subparser.add_parser(job_ensure_backups,
   241                                                                 help="Ensure backups of right tables exist")
   242      ensure_backups_parser.add_argument('--duration', help="Duration in hours for which backups should exist. "
   243                                                            "Must not be set with --period-from and --period-to", type=valid_duration)
   244      ensure_backups_parser.add_argument('--period-from', type=valid_date, help="Backups should exist starting from the date. Must not be set with --duration")
   245      ensure_backups_parser.add_argument('--period-to', type=valid_date,
   246                                                       default=datetime.utcnow().strftime("%Y-%m-%d"))
   247      ensure_backups_parser.add_argument('--delete-out-of-range-backups', help="Delete backups which are out of range of duration for which backups are being ensured",
   248                                         default=False)
   249      ensure_backups_parser.set_defaults(func=ensure_backups)
   250  
   251      args = parser.parse_args()
   252  
   253      args.func(args)
   254  
   255  
   256  if __name__ == "__main__":
   257      main()