github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/hive.lua (about)

     1  local pathlib = require("path")
     2  local utils = require("lakefs/catalogexport/internal")
     3  local strings = require("strings")
     4  local DEFAULT_PAGE_SIZE = 30
     5  
     6  -- extract partition prefix from full path
     7  local function extract_partitions_path(partitions, path)
     8      if partitions == nil or #partitions == 0 then
     9          return ""
    10      end
    11      local idx = 1
    12      local is_partition_prefix = strings.has_prefix(path, partitions[1])
    13      for part_idx, partition in ipairs(partitions) do
    14          local col_substr = "/" .. partition .. "="
    15          -- if partition is the path prefix and we are the that first partition remove /
    16          if part_idx == 1 and is_partition_prefix then
    17              col_substr = partition .. "="
    18          end
    19          local i, j = string.find(path, col_substr, idx)
    20          if i == nil then
    21              return nil
    22          end
    23          local separator_idx = string.find(path, "/", j + 1)
    24          -- verify / found and there is something in between = ... / 
    25          if separator_idx == nil or separator_idx <= (j + 1) then
    26              return nil
    27          end
    28          idx = separator_idx
    29      end
    30      return string.sub(path, 1, idx)
    31  end
    32  
    33  -- Hive format partition iterator each result set is a collection of files under the same partition
    34  local function extract_partition_pager(client, repo_id, commit_id, base_path, partition_cols, page_size)
    35      local target_partition = ""
    36      local pager = utils.lakefs_object_pager(client, repo_id, commit_id, "", base_path, "",
    37          page_size or DEFAULT_PAGE_SIZE)
    38      local page = pager()
    39      return function()
    40          if page == nil then
    41              return nil
    42          end
    43          local partition_entries = {}
    44          while true do
    45              if #page == 0 then
    46                  page = pager()
    47                  if page == nil then -- no more records
    48                      return target_partition, partition_entries
    49                  end
    50              end
    51              local entry = page[1]
    52              local partition_key = extract_partitions_path(partition_cols, entry.path)
    53              if not pathlib.is_hidden(entry.path) and partition_key ~= nil then
    54                  -- first time: if not set, assign current object partition as the target_partition key
    55                  if target_partition == "" then
    56                      target_partition = partition_key
    57                  end
    58                  -- break if current entry does not belong to the target_partition
    59                  if partition_key ~= target_partition then
    60                      local partition_result = target_partition
    61                      target_partition = partition_key
    62                      return partition_result, partition_entries
    63                  end
    64                  -- if entry is not a hadoop directory marker add the file to the result set
    65                  if not (entry.path == partition_key and entry.size_bytes == 0) then 
    66                      table.insert(partition_entries, {
    67                          physical_address = entry.physical_address,
    68                          path = entry.path,
    69                          size = entry.size_bytes,
    70                          checksum = entry.checksum
    71                      }) 
    72                  end
    73              end
    74              -- remove entry (if its part of current partition, hidden files etc) from the entry set
    75              table.remove(page, 1)
    76          end
    77      end
    78  end
    79  
    80  return {
    81      extract_partition_pager = extract_partition_pager
    82  }