-- lakefs/catalogexport/hive.lua
-- Helpers for walking objects laid out in Hive-style partitions
-- (".../col1=value1/col2=value2/file").
local pathlib = require("path")
local utils = require("lakefs/catalogexport/internal")
local strings = require("strings")

-- Page size handed to the object pager when the caller does not supply one.
local DEFAULT_PAGE_SIZE = 30

--- Extract the partition prefix from a full object path.
-- Each column in `partitions` must appear in `path`, in order, as
-- "<column>=<non-empty value>/". The result is the leading part of `path`
-- up to and including the "/" that closes the last column's value.
-- @param partitions sequence of partition column names (may be nil or empty)
-- @param path full object path
-- @return "" when there are no partition columns; the partition prefix when
--   every column was found; nil when a column is missing, has an empty value,
--   or its value is not terminated by "/"
local function extract_partitions_path(partitions, path)
    if partitions == nil or #partitions == 0 then
        return ""
    end
    local idx = 1
    local is_partition_prefix = strings.has_prefix(path, partitions[1])
    for part_idx, partition in ipairs(partitions) do
        local col_substr = "/" .. partition .. "="
        -- when the path itself starts with the first partition column there
        -- is no leading "/" to match
        if part_idx == 1 and is_partition_prefix then
            col_substr = partition .. "="
        end
        -- Plain search (4th argument): column names are literal text, not Lua
        -- patterns. Without it a name such as "event-date" is parsed as a
        -- pattern ("-" is a lazy quantifier) and never matches itself.
        local _, match_end = string.find(path, col_substr, idx, true)
        if match_end == nil then
            return nil
        end
        local separator_idx = string.find(path, "/", match_end + 1, true)
        -- verify "/" was found and there is something in between "=" and "/"
        if separator_idx == nil or separator_idx <= (match_end + 1) then
            return nil
        end
        idx = separator_idx
    end
    return string.sub(path, 1, idx)
end

--- Hive format partition iterator.
-- Each call to the returned function yields the files of one partition:
-- consecutive listed entries that share the same partition prefix.
-- Entries whose path is hidden (per `pathlib.is_hidden`) or does not contain
-- all partition columns are skipped, and so are zero-sized entries whose path
-- equals the partition prefix itself (hadoop directory markers).
-- @param client lakeFS client, forwarded to `utils.lakefs_object_pager`
-- @param repo_id repository id, forwarded to the object pager
-- @param commit_id commit id, forwarded to the object pager
-- @param base_path forwarded to the object pager
--   (presumably the listing prefix; confirm against the pager's signature)
-- @param partition_cols sequence of partition column names
-- @param page_size optional page size; defaults to DEFAULT_PAGE_SIZE
-- @return iterator function returning `partition_key, entries`, where each
--   entry is `{physical_address, path, size, checksum}`; it returns nil once
--   the listing is exhausted. The last batch is returned even when it holds
--   no entries.
local function extract_partition_pager(client, repo_id, commit_id, base_path, partition_cols, page_size)
    local target_partition = ""
    local pager = utils.lakefs_object_pager(client, repo_id, commit_id, "", base_path, "",
        page_size or DEFAULT_PAGE_SIZE)
    local page = pager()
    return function()
        if page == nil then
            return nil
        end
        local partition_entries = {}
        while true do
            -- Refill until there is an entry to look at. This is a loop rather
            -- than a single check so that an empty (non-nil) page cannot leave
            -- `entry` nil below.
            while #page == 0 do
                page = pager()
                if page == nil then -- no more records
                    return target_partition, partition_entries
                end
            end
            local entry = page[1]
            local partition_key = extract_partitions_path(partition_cols, entry.path)
            if not pathlib.is_hidden(entry.path) and partition_key ~= nil then
                -- first time: if not set, use the current object's partition
                -- as the target_partition key
                if target_partition == "" then
                    target_partition = partition_key
                end
                -- the entry starts a new partition: hand back what was
                -- collected and leave the entry in the page for the next call
                if partition_key ~= target_partition then
                    local partition_result = target_partition
                    target_partition = partition_key
                    return partition_result, partition_entries
                end
                -- if entry is not a hadoop directory marker add the file to
                -- the result set
                if not (entry.path == partition_key and entry.size_bytes == 0) then
                    table.insert(partition_entries, {
                        physical_address = entry.physical_address,
                        path = entry.path,
                        size = entry.size_bytes,
                        checksum = entry.checksum
                    })
                end
            end
            -- drop the entry just handled (collected, hidden, or not
            -- partitioned) from the front of the page
            table.remove(page, 1)
        end
    end
end

return {
    extract_partition_pager = extract_partition_pager
}