-- Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/testdata/lua/catalogexport_delta.lua
--
-- Test script for "lakefs/catalogexport/delta_exporter".
-- It registers mocks in package.loaded, runs export_delta_log against
-- in-memory fixtures, and then asserts on what the mocks recorded and on the
-- value returned by the exporter. The global `action` is expected to be
-- supplied by the runtime that executes this script.

local pathlib = require("path")
local json = require("encoding/json")
local utils = require("lakefs/catalogexport/internal")
local strings = require("strings")


local test_data = {
    --[[
        Used to mark the objects to which a stat_object request was issued.
        {
            "table_path1": { "file1.parquet" = true, "file2.parquet" = true, ...},
            "table_path2": { "file1.parquet" = true, "file2.parquet" = true, ...}
        }
    ]]
    table_to_objects = {},
    --[[
        Used to validate the expected Delta Log content
        {
            "<physical_table_log_entry_address>" = "<physical log content>",
            ...
        }
    ]]
    output_delta_log = {},
    --[[ Used to return a mock response from "delta_client.get_table()"
        {
            "<n>" = {<initial log content>},
            "<n+1>" = {<initial log content>},
        }
    ]]
    table_logs_content = {},
    --[[ Used to validate the expected log content for a given table.
        {<table_name1> = {
            "<n>" = {<expected log content>},
            "<n+1>" = {<expected log content>},
        }},
        ...
    ]]
    table_expected_log = {},
}

--- Build the fake physical address used by the mocks and the expectations.
-- @param path logical object path
-- @return the path prefixed with "s3://"
local function generate_physical_address(path)
    return "s3://" .. path
end

-- Mocks are placed in package.loaded before the exporter is required below;
-- `require` returns entries already present in package.loaded, so the exporter
-- presumably resolves these module names to the mocks (its own requires are
-- not visible from this file).
package.loaded["lakefs/catalogexport/table_extractor"] = {
    --- Mock table descriptor lookup.
    -- Uses the descriptor file's base name up to the first "." as both the
    -- table name and the table path. Asserts the file has a ".yaml" suffix.
    get_table_descriptor = function(_, _, _, table_src_path)
        local t_name_yaml = pathlib.parse(table_src_path)
        local t_name_yaml_base = t_name_yaml["base_name"]
        assert(strings.has_suffix(t_name_yaml_base, ".yaml"))
        local t_name = strings.split(t_name_yaml_base, ".")[1]
        return {
            name = t_name,
            path = t_name
        }
    end
}

package.loaded.lakefs = {
    --- Mock of lakefs.stat_object.
    -- Records the requested object in test_data.table_to_objects, keyed by the
    -- parent component of `path`, and returns status 200 with a JSON body that
    -- contains the fake physical address of `path`.
    stat_object = function(_, _, path)
        local parsed_path = pathlib.parse(path)
        local table_path_base = parsed_path["parent"]
        if strings.has_suffix(table_path_base, "/") then
            -- keep only the first "/"-separated segment of the parent
            table_path_base = strings.split(table_path_base, "/")[1]
        end
        if not test_data.table_to_objects[table_path_base] then
            test_data.table_to_objects[table_path_base] = {}
        end
        -- mark the given parquet file path under a specific table as requested.
        test_data.table_to_objects[table_path_base][parsed_path["base_name"]] = true
        return 200, json.marshal({
            physical_address = generate_physical_address(path),
        })
    end
}

local delta_export = require("lakefs/catalogexport/delta_exporter")

--- Create a mock Delta client backed by the given fixtures.
-- @param table_logs_content map of table path -> log content
-- @return a table exposing get_table, which returns the log content for the
--         requested path plus a metadata table holding a description
local function mock_delta_client(table_logs_content)
    return {
        get_table = function(_, _, path)
            --[[ For the given table's path:
                {"0" = <logical log content>, "1" = <logical log content>}
            ]]
            return table_logs_content[path], { description = "Description for " .. path }
        end
    }
end

--- Mock object writer: stores `data` under `key` in test_data.output_delta_log.
local function mock_object_writer(_, key, data)
    test_data.output_delta_log[key] = data
end

--- Assert that every table was exported to the expected location.
-- The expected location is the storage URI prefix computed from `action`
-- joined with the base name of the table path.
-- @param delta_table_details value returned by export_delta_log
-- @param table_paths list of table paths to check
local function assert_physical_address(delta_table_details, table_paths)
    local ns = action.storage_namespace
    local commit_id = action.commit_id
    local table_export_prefix = utils.get_storage_uri_prefix(ns, commit_id, action)

    for _, table_path in ipairs(table_paths) do
        local table_name = pathlib.parse(table_path)["base_name"]
        local table_details = delta_table_details[table_path]
        if table_details == nil then
            error("missing table location: " .. table_path)
        end
        local expected_location = pathlib.join("/", table_export_prefix, table_name)
        if expected_location ~= table_details["path"] then
            error(string.format("unexpected table location \"%s\".\nexpected: \"%s\"", table_details["path"], expected_location))
        end
    end
end

--- Assert that every table carries the description produced by the mock client.
-- @param delta_table_details value returned by export_delta_log
-- @param table_paths list of table paths to check
local function assert_metadata(delta_table_details, table_paths)
    for _, table_path in ipairs(table_paths) do
        local table_details = delta_table_details[table_path]
        if table_details == nil then
            error("missing table location: " .. table_path)
        end
        local expected_description = "Description for " .. table_path
        -- A missing metadata table is reported as a mismatch rather than
        -- crashing with an "attempt to index nil" error.
        local metadata = table_details["metadata"]
        local actual_description = metadata and metadata["description"]
        if expected_description ~= actual_description then
            -- Report the actual description (not the table path) so the
            -- failure message shows the value that was compared.
            error(string.format("unexpected table description \"%s\".\nexpected: \"%s\"", tostring(actual_description), expected_description))
        end
    end
end

--- Assert that stat_object was called for every data file of every table.
-- @param table_names list of table paths expected in test_data.table_to_objects
-- @param content_paths list of data file names expected under each table
local function assert_lakefs_stats(table_names, content_paths)
    for _, table_path in ipairs(table_names) do
        -- named to avoid shadowing the built-in `table` library
        local requested_objects = test_data.table_to_objects[table_path]
        if not requested_objects then
            error("missing lakeFS stat_object call for table path: " .. table_path .. "\n")
        end
        for _, data_path in ipairs(content_paths) do
            if not requested_objects[data_path] then
                error("missing lakeFS stat_object call for data path: " .. data_path .. " in table path: " .. table_path .. "\n")
            end
        end
    end
end

--- Assert that the written Delta log files match the expected content.
-- For each exported table, every expected log entry must have been written by
-- mock_object_writer under "<table location key>/<entry>", and its content
-- must equal the expected rows joined with a trailing "\n" after each row.
-- @param delta_table_details value returned by export_delta_log
-- @param table_to_physical_content map of table name -> entry -> list of rows
local function assert_delta_log_content(delta_table_details, table_to_physical_content)
    for table_path, table_details in pairs(delta_table_details) do
        local table_loc = table_details["path"]
        local table_name = pathlib.parse(table_path)["base_name"]
        local table_loc_key = utils.parse_storage_uri(table_loc).key
        local content_table = table_to_physical_content[table_name]
        if not content_table then
            error("unknown table " .. table_name)
        end
        for entry, content in pairs(content_table) do
            local full_key = table_loc_key .. "/" .. entry
            local output_content = test_data.output_delta_log[full_key]
            if not output_content then
                error("missing log file for path: " .. full_key .. "\n")
            end
            -- Collect rows in a buffer and join once instead of growing a
            -- string with repeated concatenation.
            local rows = {}
            for i, row in ipairs(content) do
                rows[i] = row .. "\n"
            end
            local str_content = table.concat(rows)
            if output_content ~= str_content then
                error("expected content:\n" .. str_content .. "\n\nactual content:\n" .. output_content)
            end
        end
    end
end

-- Test data
local data_paths = { "part-c000.snappy.parquet", "part-c001.snappy.parquet", "part-c002.snappy.parquet", "part-c003.snappy.parquet" }
local test_table_names = { "table1", "table2" }

for _, table_name in ipairs(test_table_names) do
    -- Logical log content handed to the exporter through the mock client.
    test_data.table_logs_content[table_name] = {
        ["_delta_log/00000000000000000000.json"] = {
            "{\"commitInfo\":\"some info\"}",
            "{\"add\": {\"path\":\"part-c000.snappy.parquet\"}}",
            "{\"remove\": {\"path\":\"part-c001.snappy.parquet\"}}",
            "{\"protocol\":\"the protocol\"}",
        },
        ["_delta_log/00000000000000000001.json"] = {
            "{\"metaData\":\"some metadata\"}",
            "{\"add\": {\"path\":\"part-c002.snappy.parquet\"}}",
            "{\"remove\": {\"path\":\"part-c003.snappy.parquet\"}}",
        }
    }
    -- Expected output: same entries with paths replaced by physical addresses.
    test_data.table_expected_log[table_name] = {
        ["_delta_log/00000000000000000000.json"] = {
            "{\"commitInfo\":\"some info\"}",
            "{\"add\":{\"path\":\"" .. generate_physical_address(table_name .. "/part-c000.snappy.parquet") .. "\"}}",
            "{\"remove\":{\"path\":\"" .. generate_physical_address(table_name .. "/part-c001.snappy.parquet") .. "\"}}",
            "{\"protocol\":\"the protocol\"}",
        },
        ["_delta_log/00000000000000000001.json"] = {
            "{\"metaData\":\"some metadata\"}",
            "{\"add\":{\"path\":\"" .. generate_physical_address(table_name .. "/part-c002.snappy.parquet") .. "\"}}",
            "{\"remove\":{\"path\":\"" .. generate_physical_address(table_name .. "/part-c003.snappy.parquet") .. "\"}}",
        }
    }
end


-- Run Delta export test
local delta_table_details = delta_export.export_delta_log(
    action,
    test_table_names,
    mock_object_writer,
    mock_delta_client(test_data.table_logs_content),
    "some_path"
)

-- Test results
assert_lakefs_stats(test_table_names, data_paths)
assert_physical_address(delta_table_details, test_table_names)
assert_delta_log_content(delta_table_details, test_data.table_expected_log)
assert_metadata(delta_table_details, test_table_names)