github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/s3_hive_manifest_exporter.lua

--[[
Automatic Symlink Exporter

On every supported lakeFS event (tag or branch creation, commit, merge),
exports Hive-style symlink manifests (symlink.txt files) to S3 for the
configured source paths.

Args:
 - aws_access_key_id, aws_secret_access_key, aws_region (string): configuration passed to the S3 client that writes the symlinks
 - export_bucket (string): bucket to write symlinks to
 - export_path (string): path in the bucket to write symlinks to
 - sources ([]string): lakeFS paths that should be exported as symlinks

Example hook declaration (_lakefs_actions/auto-symlinks.yaml):

name: auto symlink
on:
  post-create-branch:
    branches: ["view-*"]
  post-commit:
    branches: ["view-*"]
hooks:
  - id: symlink_creator
    type: lua
    properties:
      script_path: scripts/s3_hive_manifest_exporter.lua
      args:
        # Export configuration
        aws_access_key_id: "AKIA..."
        aws_secret_access_key: "..."
        aws_region: us-east-1
        export_bucket: oz-repo
        export_path: lakefs_tables
        sources:
          - tables/my-table/
]]
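
--[[
Illustrative export layout, assuming the example args above (actual paths
depend on the args and on the ref that triggered the event):

s3://oz-repo/lakefs_tables/repositories/<repository_id>/refs/<ref>/tables/my-table/
    _start_commit_id         commit id recorded when an export starts
    _completed_commit_id     commit id recorded when an export finishes
    <dir>/symlink.txt        newline-separated physical addresses of the
                             non-hidden objects under <dir> at that commit
]]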

aws = require("aws")
lakefs = require("lakefs")
path = require("path")
path_sep = path.default_separator()

-- S3 client used to write the exported symlink files
s3 = aws.s3_client(args.aws_access_key_id, args.aws_secret_access_key, args.aws_region)

-- supported event types, grouped by the field that names the exported ref
tag_events = { ["pre-create-tag"] = true, ["post-create-tag"] = true }
branch_events = { ["pre-create-branch"] = true, ["post-create-branch"] = true }
commit_events = { ["post-commit"] = true, ["post-merge"] = true }

-- resolve the ref whose state is exported, based on the event type
local current_commit = action.commit_id
local ref
if tag_events[action.event_type] then
    ref = action.tag_id
elseif branch_events[action.event_type] or commit_events[action.event_type] then
    -- commit and merge events export the state of the branch they ran on
    ref = action.branch_id
else
    error("unsupported event type: " .. action.event_type)
end
-- root export path for the current repository
export_path = path.join(path_sep, args.export_path, "repositories", action.repository_id)

for _, location in ipairs(args.sources) do
    location_export_path = path.join(path_sep, export_path, "refs", ref, location)
    start_marker = path.join(path_sep, location_export_path, "_start_commit_id")
    end_marker = path.join(path_sep, location_export_path, "_completed_commit_id")
    -- read start_commit from S3
    start_commit, exists = s3.get_object(args.export_bucket, start_marker)
    if not exists then
        -- no commit marker
        print("no _start_commit_id found for location '" .. location .. "'")
        start_commit = nil
    end
    -- read end_commit from S3
    end_commit, exists = s3.get_object(args.export_bucket, end_marker)
    if not exists then
        -- no commit marker
        print("no _completed_commit_id found for location '" .. location .. "'")
        end_commit = nil
    end

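    -- Consistency check: each export writes _start_commit_id before it runs
    -- and _completed_commit_id after it finishes. If either marker is missing
    -- or the two disagree, the previous export never completed (or never
    -- ran), so this location must be rebuilt from scratch.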
    clean_mode = false
    if (not start_commit or not end_commit) or (start_commit ~= end_commit) then
        -- we need to clean up and start from scratch
        print("going into clean mode for location '" .. location .. "', deleting export path s3://" .. args.export_bucket .. "/" .. location_export_path)
        s3.delete_recursive(args.export_bucket, location_export_path)
        clean_mode = true
    end
    -- write start_commit
    print("writing _start_commit_id: " .. current_commit)
    s3.put_object(args.export_bucket, start_marker, current_commit)

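    -- Two strategies follow: clean mode re-lists the entire location and
    -- rewrites every directory's manifest; incremental mode regenerates
    -- manifests only for directories changed between start_commit and
    -- current_commit.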
    if clean_mode then
        -- instead of diffing, list everything and group entries by directory
        local after = ""
        local has_more = true
        local current_subloc = ""
        local current_files = {}
        while has_more do
            local code, resp = lakefs.list_objects(action.repository_id, current_commit, after, location, "") -- without delimiter
            if code ~= 200 then
                error("could not list path: " .. location .. ", error: " .. resp.message)
            end
            for _, entry in ipairs(resp.results) do
                p = path.parse(entry.path)
                -- did we move on to the next dir?
                if p.parent ~= current_subloc then
                    -- we moved on to a new directory! let's flush the previous one
                    if #current_files > 0 then
                        symlink_path = path.join(path_sep, location_export_path, current_subloc, "symlink.txt")
                        print("writing symlink file for " .. symlink_path)
                        s3.put_object(args.export_bucket, symlink_path, table.concat(current_files, "\n"))
                    end
                    -- done, update the current dir
                    current_subloc = p.parent
                    current_files = {}
                end
                -- add physical address
                if not path.is_hidden(entry.path) then
                    table.insert(current_files, entry.physical_address)
                end
            end

            -- pagination
            has_more = resp.pagination.has_more
            after = resp.pagination.next_offset
        end
        -- do we have anything left to flush?
        if #current_files > 0 then
            symlink_path = path.join(path_sep, location_export_path, current_subloc, "symlink.txt")
            print("writing symlink file for " .. symlink_path)
            s3.put_object(args.export_bucket, symlink_path, table.concat(current_files, "\n"))
        end
    else
        -- incremental mode: diff start_commit against current_commit and
        -- collect the parent directory of every changed entry
        dirty_locations = {}
        local has_more = true
        local after = ""
        while has_more do
            print("diffing. current commit = " .. current_commit .. ", start commit = " .. start_commit .. ", after = " .. after .. ", location = " .. location)
            local code, resp = lakefs.diff_refs(action.repository_id, start_commit, current_commit, after, location, "") -- recursive
            if code ~= 200 then
                error("could not diff path: " .. location .. ", error: " .. resp.message)
            end
            -- for every modified prefix
            print("\t got " .. tostring(#resp.results) .. " results, iterating")
            for _, entry in ipairs(resp.results) do
                p = path.parse(entry.path)
                if dirty_locations[#dirty_locations] ~= p.parent then
                    print("adding 'dirty' location: " .. p.parent)
                    table.insert(dirty_locations, p.parent)
                end
            end
            -- pagination
            has_more = resp.pagination.has_more
            after = resp.pagination.next_offset
        end
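        -- Note: the duplicate check above compares only against the last
        -- element; this assumes diff results are ordered by path, so entries
        -- sharing a parent directory arrive adjacently.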

        -- now, for every dirty location, regenerate its symlink
        for _, subloc in ipairs(dirty_locations) do
            local has_more = true
            local after = ""
            local current_entries = {}
            while has_more do
                local code, resp = lakefs.list_objects(action.repository_id, current_commit, after, subloc, "") -- without delimiter
                if code ~= 200 then
                    error("could not list path: " .. subloc .. ", error: " .. resp.message)
                end
                for _, entry in ipairs(resp.results) do
                    if not path.is_hidden(entry.path) then
                        table.insert(current_entries, entry.physical_address)
                    end
                end
                -- pagination
                has_more = resp.pagination.has_more
                after = resp.pagination.next_offset
            end
            symlink_path = path.join(path_sep, location_export_path, subloc, "symlink.txt")
            if #current_entries == 0 then
                print("removing stale symlink path: " .. symlink_path)
                s3.delete_object(args.export_bucket, symlink_path)
            else
                print("writing symlink path: " .. symlink_path)
                s3.put_object(args.export_bucket, symlink_path, table.concat(current_entries, "\n"))
            end
        end
    end
    -- done with location! write end_marker
    s3.put_object(args.export_bucket, end_marker, current_commit)
    print("done! wrote _completed_commit_id: " .. current_commit)
end