github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/symlink_exporter.lua

local extractor = require("lakefs/catalogexport/table_extractor")
local hive = require("lakefs/catalogexport/hive")
local utils = require("lakefs/catalogexport/internal")
local pathlib = require("path")
local strings = require("strings")
local lakefs = require("lakefs")

--[[
    @repo_id: repository id
    @commit_id: commit id of the current table
    @table_src_path: path to the table spec (e.g. _lakefs_tables/my_table.yaml)
    @options:
    - debug(boolean): print extra progress information
    - skip_trim_obj_base_path(boolean): if true, skip removing the table base path prefix from the partition path.
    An illustrative example of the entries this iterator yields follows the function body.
]]
local function export_it(repo_id, commit_id, table_src_path, options)
    local opts = options or {}
    local descriptor = extractor.get_table_descriptor(lakefs, repo_id, commit_id, table_src_path)
    if descriptor.type ~= "hive" then
        error("table " .. descriptor.type .. " in path " .. table_src_path .. " not supported")
    end
    if opts.debug then
        print(string.format('%s table `lakefs://%s/%s/%s`', descriptor.type, repo_id, utils.short_digest(commit_id),
            descriptor.path))
    end
    local base_path = descriptor.path
    local cols = descriptor.partition_columns
    local pager = hive.extract_partition_pager(lakefs, repo_id, commit_id, base_path, cols)
    return function()
        local part_key, entries = pager()
        if part_key == nil then
            return nil
        end
        local symlink_data = ""
        for _, entry in ipairs(entries) do
            symlink_data = symlink_data .. entry.physical_address .. "\n"
        end
        -- create key suffix for symlink file
        local storage_key_suffix = part_key
        if #descriptor.partition_columns == 0 then
            storage_key_suffix = descriptor.name .. "/" .. "symlink.txt"
        else
            if not opts.skip_trim_obj_base_path then
                storage_key_suffix = strings.replace(part_key, base_path .. "/", "", 1) -- remove base_path prefix from partition path
            end
            -- build the suffix as <table name>/<partition path>/symlink.txt
            storage_key_suffix = pathlib.join("/", descriptor.name, storage_key_suffix, "symlink.txt")
        end
        return {
            key_suffix = storage_key_suffix,
            data = symlink_data
        }
    end
end
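--[[
    Illustrative sketch (hypothetical names, not defined by this module): given a hive
    table descriptor with name "animals", path "tables/animals" and partition columns
    ["year", "month"], each call to the iterator above would yield something like:

        {
            key_suffix = "animals/year=2024/month=05/symlink.txt",
            data = "<physical address of object 1>\n<physical address of object 2>\n"
        }

    For a table with no partition columns, a single "animals/symlink.txt" entry is
    produced instead.
]]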

--[[
    Export symlink files that represent a table to S3.
    @s3_client: a configured S3 client
    @table_src_path: object path to the table spec (e.g. _lakefs_tables/my_table.yaml)
    @action_info: the global action object
    @options:
    - debug(boolean): print extra progress information
    - export_base_uri(string): override the export prefix in S3, e.g. s3://other-bucket/path/
    - writer(function(bucket, key, data)): if provided, used instead of the S3 client; useful for debugging.
    A usage sketch appears at the end of this file.
]]
local function export_s3(s3_client, table_src_path, action_info, options)
    local opts = options or {}
    local repo_id = action_info.repository_id
    local commit_id = action_info.commit_id
    local base_prefix = opts.export_base_uri or action_info.storage_namespace
    local export_base_uri = utils.get_storage_uri_prefix(base_prefix, commit_id, action_info)
    local location = utils.parse_storage_uri(export_base_uri)
    local put_object = opts.writer or s3_client.put_object
    local it = export_it(repo_id, commit_id, table_src_path, opts)
    for symlink in it do
        local key = pathlib.join("/", location.key, symlink.key_suffix)
        if opts.debug then
            print("S3 writing bucket: " .. location.bucket .. " key: " .. key)
        end
        put_object(location.bucket, key, symlink.data)
    end
    return {
        location = location
    }
end

return {
    export_s3 = export_s3,
}
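
--[[
    Usage sketch: a lakeFS action hook might call export_s3 roughly as below. This is
    an assumption-laden illustration: the "aws" module, its s3_client constructor and
    the args fields shown here are not defined in this file and may differ in practice.

        local exporter = require("lakefs/catalogexport/symlink_exporter")
        local aws = require("aws")
        local s3 = aws.s3_client(args.aws.access_key_id, args.aws.secret_access_key, args.aws.region)
        exporter.export_s3(s3, args.table_source, action, { debug = true })
]]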