github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/table_extractor.lua (about)

     1  local pathlib = require("path")
     2  local strings = require("strings")
     3  local yaml = require("encoding/yaml")
     4  local utils = require("lakefs/catalogexport/internal")
     5  
     6  local LAKEFS_TABLES_BASE = "_lakefs_tables/"
     7  
     --- Decide whether a lakeFS listing entry is a table descriptor file.
     -- An entry qualifies when its `path_type` is "object", its path (with the
     -- `tables_base` prefix removed, when present) is not reported as hidden by
     -- `pathlib.is_hidden`, and the path ends with ".yaml".
     -- @param entry listing entry; only `entry.path_type` and `entry.path` are read
     -- @param tables_base prefix stripped from the path before the hidden check
     -- @return boolean true when the entry should be treated as a table descriptor
     local function is_table_obj(entry, tables_base)
         if entry.path_type ~= "object" then
             return false
         end
         local path = entry.path
         if strings.has_prefix(path, tables_base) then
             -- Strip the base prefix, presumably so that the base directory's own
             -- leading "_" is not counted as hidden (confirm against
             -- pathlib.is_hidden). Lua strings are 1-indexed, so the remainder
             -- starts at #tables_base + 1; starting at #tables_base would keep
             -- the prefix's trailing character.
             path = path:sub(#tables_base + 1)
         end
         return not pathlib.is_hidden(path) and strings.has_suffix(path, ".yaml")
     end
    20  
    --- Collect the table descriptor entries listed under LAKEFS_TABLES_BASE.
    -- Iterates the pages produced by `utils.lakefs_object_pager` and keeps every
    -- entry accepted by `is_table_obj`.
    -- @param client client object handed through to the pager
    -- @param repo_id repository identifier handed through to the pager
    -- @param commit_id commit identifier handed through to the pager
    -- @return array of tables, each holding `physical_address` and `path`
    local function list_table_descriptor_entries(client, repo_id, commit_id)
        local descriptors = {}
        local per_page = 30
        -- NOTE(review): the two empty-string arguments are presumably the listing
        -- "after" cursor and the delimiter; confirm against lakefs_object_pager.
        local next_page = utils.lakefs_object_pager(client, repo_id, commit_id, "", LAKEFS_TABLES_BASE, "", per_page)
        for page in next_page do
            for _, obj in ipairs(page) do
                if is_table_obj(obj, LAKEFS_TABLES_BASE) then
                    -- Keep only the two fields callers need from the listing entry.
                    descriptors[#descriptors + 1] = {
                        physical_address = obj.physical_address,
                        path = obj.path,
                    }
                end
            end
        end
        return descriptors
    end
    38  
    --- Fetch a table descriptor object and return it as a parsed YAML table.
    -- The returned table always has a `partition_columns` field; it defaults to
    -- an empty table when the descriptor does not define one.
    -- @param client client object exposing `get_object(repo_id, commit_id, path)`
    -- @param repo_id repository identifier passed to `client.get_object`
    -- @param commit_id commit identifier passed to `client.get_object`
    -- @param logical_path path of the descriptor object to read
    -- @return table the parsed descriptor
    -- Raises an error when the object cannot be fetched (status other than 200)
    -- or when its content does not parse to a table.
    local function get_table_descriptor(client, repo_id, commit_id, logical_path)
        local code, content = client.get_object(repo_id, commit_id, logical_path)
        if code ~= 200 then
            error("could not fetch data file: HTTP " .. tostring(code) .. " path: " .. logical_path)
        end
        local descriptor = yaml.unmarshal(content)
        -- Fail with the offending path instead of an opaque "attempt to index"
        -- error when the content yields a non-table value (e.g. nil).
        if type(descriptor) ~= "table" then
            error("invalid table descriptor: YAML content did not parse to a table, path: " .. logical_path)
        end
        descriptor.partition_columns = descriptor.partition_columns or {}
        return descriptor
    end
    49  
    -- Public interface of this module: only the two functions below are exported.
    local exports = {}
    exports.list_table_descriptor_entries = list_table_descriptor_entries
    exports.get_table_descriptor = get_table_descriptor
    return exports