github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/testdata/lua/catalogexport_delta.lua (about)

     1  local pathlib = require("path")
     2  local json = require("encoding/json")
     3  local utils = require("lakefs/catalogexport/internal")
     4  local strings = require("strings")
     5  
     6  
     -- Shared state for this test: the mocks below record what they were asked
     -- to do here, and the assertions at the end of the file read it back.
     local test_data = {
         -- Marks the objects for which a stat_object request was issued, grouped
         -- by table path:
         --   {
         --       "table_path1": { "file1.parquet" = true, "file2.parquet" = true, ... },
         --       "table_path2": { "file1.parquet" = true, "file2.parquet" = true, ... }
         --   }
         table_to_objects = {},

         -- Everything handed to the mocked object writer, used to validate the
         -- exported Delta Log content:
         --   {
         --       "physical_table_log_entry_address" = "physical log content",
         --       ...
         --   }
         output_delta_log = {},

         -- Served as the mock response of "delta_client.get_table()", looked up
         -- by table path:
         --   {
         --       "table_path": {
         --           "log entry N"   = { initial log content rows },
         --           "log entry N+1" = { initial log content rows },
         --       },
         --       ...
         --   }
         table_logs_content = {},

         -- Expected log content for each table, in the same layout as above:
         --   {
         --       "table_name1": {
         --           "log entry N"   = { expected log content rows },
         --           "log entry N+1" = { expected log content rows },
         --       },
         --       ...
         --   }
         table_expected_log = {},
     }
    40  
    -- Builds the fake physical address that the lakeFS mock reports for `path`:
    -- the path itself prefixed with the "s3://" scheme.
    local function generate_physical_address(path)
        local scheme = "s3://"
        return scheme .. path
    end
    44  
    -- Stand-in for the "lakefs/catalogexport/table_extractor" module. It is
    -- registered in package.loaded so that a later require() of that module
    -- name resolves to this table.
    local table_extractor_mock = {}

    -- Derives a table descriptor from the descriptor file path alone: the file
    -- must be named "*.yaml", and the text before the first "." of its base name
    -- is used as both the table name and the table path.
    function table_extractor_mock.get_table_descriptor(_, _, _, table_src_path)
        local yaml_file_name = pathlib.parse(table_src_path)["base_name"]
        assert(strings.has_suffix(yaml_file_name, ".yaml"))
        local name_parts = strings.split(yaml_file_name, ".")
        local table_name = name_parts[1]
        return {
            name = table_name,
            path = table_name
        }
    end

    package.loaded["lakefs/catalogexport/table_extractor"] = table_extractor_mock
    57  
    -- Stand-in for the "lakefs" module, registered in package.loaded so that a
    -- later require("lakefs") resolves to this table.
    local lakefs_mock = {}

    -- Records that `path` was stat-ed and answers with status code 200 plus a
    -- JSON document holding the generated physical address of `path`.
    function lakefs_mock.stat_object(_, _, path)
        local parsed = pathlib.parse(path)
        local table_key = parsed["parent"]
        if strings.has_suffix(table_key, "/") then
            -- Only the first "/"-separated segment of the parent is kept as the table key.
            table_key = strings.split(table_key, "/")[1]
        end

        local requested = test_data.table_to_objects[table_key]
        if not requested then
            requested = {}
            test_data.table_to_objects[table_key] = requested
        end
        -- Flag the file (by base name) as requested under its table.
        requested[parsed["base_name"]] = true

        local stat = {
            physical_address = generate_physical_address(path),
        }
        return 200, json.marshal(stat)
    end

    package.loaded.lakefs = lakefs_mock
    75  
    76  local delta_export = require("lakefs/catalogexport/delta_exporter")
    77  
    -- Creates a fake Delta client backed by `table_logs_content`.
    -- Its get_table(_, _, path) returns two values:
    --   1. table_logs_content[path], e.g.
    --      {"0" = logical log content, "1" = logical log content}
    --   2. a metadata table whose description is "Description for " .. path
    local function mock_delta_client(table_logs_content)
        local client = {}

        function client.get_table(_, _, path)
            local log_content = table_logs_content[path]
            local metadata = { description = "Description for " .. path }
            return log_content, metadata
        end

        return client
    end
    88  
    -- Object writer stub: nothing is written anywhere; `data` is kept under
    -- `key` in test_data.output_delta_log. The first argument is ignored.
    local function mock_object_writer(_, key, data)
        local written_objects = test_data.output_delta_log
        written_objects[key] = data
    end
    92  
    -- Checks that every table in `table_paths` has an entry in
    -- `delta_table_details` whose "path" equals the export prefix (derived from
    -- the action's storage namespace and commit id) joined with the table's base
    -- name. Raises an error on the first missing or mismatching table.
    local function assert_physical_address(delta_table_details, table_paths)
        local export_prefix = utils.get_storage_uri_prefix(action.storage_namespace, action.commit_id, action)

        for _, table_path in ipairs(table_paths) do
            local base_name = pathlib.parse(table_path)["base_name"]
            local details = delta_table_details[table_path]
            if details == nil then
                error("missing table location: " .. table_path)
            end

            local actual_location = details["path"]
            local expected_location = pathlib.join("/", export_prefix, base_name)
            if actual_location ~= expected_location then
                error(string.format("unexpected table location \"%s\".\nexpected: \"%s\"", actual_location, expected_location))
            end
        end
    end
   110  
   -- Checks that every table in `table_paths` has an entry in
   -- `delta_table_details` whose metadata description is
   -- "Description for <table_path>".
   --
   -- delta_table_details: table keyed by table path; only the "metadata" field
   --                      of each entry is inspected here.
   -- table_paths:         sequence of table paths to verify.
   --
   -- Raises an error naming the first table that is missing or whose
   -- description differs from the expected one.
   local function assert_metadata(delta_table_details, table_paths)
       for _, table_path in ipairs(table_paths) do
           local table_details = delta_table_details[table_path]
           if table_details == nil then
               error("missing table location: " .. table_path)
           end
           local expected_description = "Description for " .. table_path
           -- A missing metadata table is reported as a mismatch rather than
           -- failing with an opaque "attempt to index a nil value".
           local metadata = table_details["metadata"]
           local actual_description = metadata and metadata["description"]
           if expected_description ~= actual_description then
               -- Report the description that was actually found (not the table
               -- path); tostring() keeps the message printable when it is nil.
               error(string.format("unexpected table description \"%s\".\nexpected: \"%s\"", tostring(actual_description), expected_description))
           end
       end
   end
   123  
   -- Checks that the mocked lakeFS client received a stat_object call for every
   -- data file of every table, using the marks kept in
   -- test_data.table_to_objects.
   --
   -- table_names:   sequence of table paths to verify.
   -- content_paths: sequence of data file names expected under each table.
   --
   -- Raises an error for the first table with no recorded call at all, or for
   -- the first data file that was never requested.
   local function assert_lakefs_stats(table_names, content_paths)
       for _, table_path in ipairs(table_names) do
           -- Deliberately not called `table`, which would shadow the standard
           -- `table` library for the rest of the loop body.
           local requested_objects = test_data.table_to_objects[table_path]
           if not requested_objects then
               error("missing lakeFS stat_object call for table path: " .. table_path .. "\n")
           end
           for _, data_path in ipairs(content_paths) do
               if not requested_objects[data_path] then
                   error("missing lakeFS stat_object call for data path: " .. data_path .. " in table path: " .. table_path .. "\n")
               end
           end
       end
   end
   137  
   -- Checks that the content handed to the mocked object writer matches the
   -- expected Delta Log of every exported table.
   --
   -- delta_table_details:       table keyed by table path; each entry's "path"
   --                            is the storage URI the table was exported to.
   -- table_to_physical_content: table keyed by table name (the base name of the
   --                            table path); each value maps a log entry to the
   --                            sequence of rows expected in that entry.
   --
   -- For each expected entry, the written object is looked up in
   -- test_data.output_delta_log under "<key of the table URI>/<entry>" and must
   -- equal the expected rows, each terminated by "\n". Raises an error on an
   -- unknown table, a missing log file or a content mismatch.
   local function assert_delta_log_content(delta_table_details, table_to_physical_content)
       for table_path, table_details in pairs(delta_table_details) do
           local table_loc = table_details["path"]
           local table_name = pathlib.parse(table_path)["base_name"]
           local table_loc_key = utils.parse_storage_uri(table_loc).key
           local content_table = table_to_physical_content[table_name]
           if not content_table then
               error("unknown table " .. table_name)
           end
           for entry, content in pairs(content_table) do
               local full_key = table_loc_key .. "/" .. entry
               local output_content = test_data.output_delta_log[full_key]
               if not output_content then
                   error("missing log file for path: " .. full_key .. "\n")
               end
               -- Collect the newline-terminated rows and join them once instead
               -- of growing a string with ".." on every iteration.
               local rows = {}
               for i, row in ipairs(content) do
                   rows[i] = row .. "\n"
               end
               local str_content = table.concat(rows)
               if output_content ~= str_content then
                   -- tostring() keeps the report usable even if the writer was
                   -- handed something that is not a string.
                   error("expected content:\n" .. str_content .. "\n\nactual content:\n" .. tostring(output_content))
               end
           end
       end
   end
   163  
   -- Test data
   local data_paths = {
       "part-c000.snappy.parquet",
       "part-c001.snappy.parquet",
       "part-c002.snappy.parquet",
       "part-c003.snappy.parquet",
   }
   local test_table_names = { "table1", "table2"}

   -- Delta Log entries (relative to the table root) used as keys on both the
   -- input side and the expected side.
   local first_log_entry = "_delta_log/00000000000000000000.json"
   local second_log_entry = "_delta_log/00000000000000000001.json"

   for _, table_name in ipairs(test_table_names) do
       -- Physical address expected for a data file that belongs to this table.
       local function physical(file_name)
           return generate_physical_address(table_name .. "/" .. file_name)
       end

       -- Log served by the mocked Delta client: data files are referenced by
       -- their table-relative paths.
       local logical_log = {}
       logical_log[first_log_entry] = {
           "{\"commitInfo\":\"some info\"}",
           "{\"add\": {\"path\":\"part-c000.snappy.parquet\"}}",
           "{\"remove\": {\"path\":\"part-c001.snappy.parquet\"}}",
           "{\"protocol\":\"the protocol\"}",
       }
       logical_log[second_log_entry] = {
           "{\"metaData\":\"some metadata\"}",
           "{\"add\": {\"path\":\"part-c002.snappy.parquet\"}}",
           "{\"remove\": {\"path\":\"part-c003.snappy.parquet\"}}",
       }
       test_data.table_logs_content[table_name] = logical_log

       -- Log expected from the export: the same rows, with add/remove paths
       -- replaced by physical addresses.
       local expected_log = {}
       expected_log[first_log_entry] = {
           "{\"commitInfo\":\"some info\"}",
           "{\"add\":{\"path\":\"" .. physical("part-c000.snappy.parquet") .. "\"}}",
           "{\"remove\":{\"path\":\"" .. physical("part-c001.snappy.parquet") .. "\"}}",
           "{\"protocol\":\"the protocol\"}",
       }
       expected_log[second_log_entry] = {
           "{\"metaData\":\"some metadata\"}",
           "{\"add\":{\"path\":\"" .. physical("part-c002.snappy.parquet") .. "\"}}",
           "{\"remove\":{\"path\":\"" .. physical("part-c003.snappy.parquet") .. "\"}}",
       }
       test_data.table_expected_log[table_name] = expected_log
   end
   196  
   197  
   -- Run Delta export test
   local delta_client = mock_delta_client(test_data.table_logs_content)
   local delta_table_details = delta_export.export_delta_log(action, test_table_names, mock_object_writer, delta_client, "some_path")

   -- Test results
   -- 1. every data file of every table was stat-ed through the lakeFS mock
   assert_lakefs_stats(test_table_names, data_paths)
   -- 2. every table reports the expected export location
   assert_physical_address(delta_table_details, test_table_names)
   -- 3. the written log files match the expected log content
   assert_delta_log_content(delta_table_details, test_data.table_expected_log)
   -- 4. every table carries the description returned by the mocked Delta client
   assert_metadata(delta_table_details, test_table_names)