github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/delta_exporter.lua (about)

     1  local lakefs = require("lakefs")
     2  local pathlib = require("path")
     3  local json = require("encoding/json")
     4  local utils = require("lakefs/catalogexport/internal")
     5  local extractor = require("lakefs/catalogexport/table_extractor")
     6  local strings = require("strings")
     7  local url = require("net/url")
     8  --[[
     9      delta_log_entry_key_generator returns a closure that returns a Delta Lake version key according to the Delta Lake
    10      protocol: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#delta-log-entries
    11      Example:
    12          local gen = delta_log_entry_key_generator()
    13          gen() -- 000000000000000000.json
    14          gen() -- 000000000000000001.json
    15  ]]
    16  local function delta_log_entry_key_generator()
    17      local current = 0
    18      return function()
    19          local delta_log_entry_length = 20
    20          local key = tostring(current)
    21          local padding_length = delta_log_entry_length - key:len()
    22          local padded_key = ""
    23          for _ = 1, padding_length do
    24              padded_key = padded_key .. "0"
    25          end
    26          padded_key = padded_key .. key .. ".json"
    27          current = current + 1
    28          return padded_key
    29      end
    30  end
    31  
    32  --[[
    33      action:
    34          - repository_id
    35          - commit_id
    36  
    37     table_def_names: ["table1.yaml", "table2", ...]
    38  
    39      write_object: function(bucket, key, data)
    40  
    41      delta_client:
    42          - get_table: function(repo, ref, prefix)
    43      
    44      path_transformer: function(path) used for transforming path scheme (ex: Azure https to abfss)
    45  
    46  ]]
    47  local function export_delta_log(action, table_def_names, write_object, delta_client, table_descriptors_path, path_transformer)
    48      local repo = action.repository_id
    49      local commit_id = action.commit_id
    50      if not commit_id then
    51          error("missing commit id")
    52      end
    53      local ns = action.storage_namespace
    54      if ns == nil then
    55          error("failed getting storage namespace for repo " .. repo)
    56      end
    57      local response = {}
    58      for _, table_name_yaml in ipairs(table_def_names) do
    59  
    60          -- Get the table descriptor
    61          local tny  = table_name_yaml
    62          if not strings.has_suffix(tny, ".yaml") then
    63              tny = tny .. ".yaml"
    64          end
    65          local table_src_path = pathlib.join("/", table_descriptors_path, tny)
    66          local table_descriptor = extractor.get_table_descriptor(lakefs, repo, commit_id, table_src_path)
    67          local table_path = table_descriptor.path
    68          if not table_path then
    69              error("table path is required to proceed with Delta catalog export")
    70          end
    71          local table_name = table_descriptor.name
    72          if not table_name then
    73              error("table name is required to proceed with Delta catalog export")
    74          end
    75  
    76          -- Get Delta table
    77          local t, metadata = delta_client.get_table(repo, commit_id, table_path)
    78          local sortedKeys = utils.sortedKeys(t)
    79          --[[ Pairs of (version, map of json content):
    80                  (1,
    81                  {
    82                      {"commitInfo":{"timestamp":1699276565259,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":9,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"4","numOutputBytes":"1353"},"engineInfo":"Apache-Spark/3.3.2 Delta-Lake/2.3.0","txnId":"eb6816ae-404f-4338-9e1a-2cb0a4626ab3"}}
    83                      {"add":{"path":"part-00000-a5a20e52-2b3d-440b-97a8-829fbc4a2804-c000.snappy.parquet","partitionValues":{},"size":1353,"modificationTime":1699276565000,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"Hylak_id\":18,\"Lake_name\":\" \",\"Country\":\"Malawi\",\"Depth_m\":3.0},\"maxValues\":{\"Hylak_id\":16138,\"Lake_name\":\"Malombe\",\"Country\":\"Malawi\",\"Depth_m\":706.0},\"nullCount\":{\"Hylak_id\":0,\"Lake_name\":0,\"Country\":0,\"Depth_m\":0}}"}}
    84                      {"remove":{"path":"part-00000-d660b401-ceec-415a-a791-e8d1c7599e3d-c000.snappy.parquet","deletionTimestamp":1699276565259,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":82491}}
    85                  }
    86                  )
    87          ]]
    88          local table_log = {}
    89          local keyGenerator = delta_log_entry_key_generator()
    90          for _, key in ipairs(sortedKeys) do
    91              local content = t[key]
    92              local entry_log = {}
    93              --[[
    94                  An array of entries:
    95                      {"commitInfo":{"timestamp":1699276565259,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":9,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"4","numOutputBytes":"1353"},"engineInfo":"Apache-Spark/3.3.2 Delta-Lake/2.3.0","txnId":"eb6816ae-404f-4338-9e1a-2cb0a4626ab3"}},
    96                      {"add":{"path":"part-00000-a5a20e52-2b3d-440b-97a8-829fbc4a2804-c000.snappy.parquet","partitionValues":{},"size":1353,"modificationTime":1699276565000,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"Hylak_id\":18,\"Lake_name\":\" \",\"Country\":\"Malawi\",\"Depth_m\":3.0},\"maxValues\":{\"Hylak_id\":16138,\"Lake_name\":\"Malombe\",\"Country\":\"Malawi\",\"Depth_m\":706.0},\"nullCount\":{\"Hylak_id\":0,\"Lake_name\":0,\"Country\":0,\"Depth_m\":0}}"}},
    97                      {"remove":{"path":"part-00000-d660b401-ceec-415a-a791-e8d1c7599e3d-c000.snappy.parquet","deletionTimestamp":1699276565259,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":82491}}
    98              ]]
    99              for _, e in ipairs(content) do
   100                  local entry = json.unmarshal(e)
   101                  local p = ""
   102                  if entry.add ~= nil then
   103                      p = entry.add.path
   104                  elseif entry.remove ~= nil then
   105                      p = entry.remove.path
   106                  end
   107                  if p ~= "" then
   108                      local unescaped_path = url.query_unescape(p)
   109                      if not unescaped_path then
   110                          error("failed unescaping path: " .. p)
   111                      end
   112                      unescaped_path = pathlib.join("/", table_path, unescaped_path)
   113                      local code, obj = lakefs.stat_object(repo, commit_id, unescaped_path)
   114                      if code == 200 then
   115                          local obj_stat = json.unmarshal(obj)
   116                          --[[
   117                          This code block handles escaping of the physical address path part
   118                          Since we don't want to escape the entire URL (i.e. schema, host), we parse the url and rebuild it.
   119                          Building the url will then handle any escaping needed on the relevant parts.
   120                          ]]
   121                          local u = url.parse(obj_stat["physical_address"])
   122                          local physical_path = url.build_url(u["scheme"], u["host"], u["path"])
   123                          if path_transformer ~= nil then
   124                              physical_path = path_transformer(physical_path)
   125                          end
   126                          if entry.add ~= nil then
   127                              entry.add.path = physical_path
   128                          elseif entry.remove ~= nil then
   129                              entry.remove.path = physical_path
   130                          end
   131                      else
   132                          error("failed stat_object with code: " .. tostring(code) .. ", and path: " .. unescaped_path)
   133                      end
   134                  end
   135                  local entry_m = json.marshal(entry)
   136                  table.insert(entry_log, entry_m)
   137              end
   138              table_log[keyGenerator()] = entry_log
   139          end
   140  
   141          local table_export_prefix = utils.get_storage_uri_prefix(ns, commit_id, action)
   142          local table_physical_path = pathlib.join("/", table_export_prefix, table_name)
   143          local table_log_physical_path = pathlib.join("/", table_physical_path, "_delta_log")
   144  
   145          -- Upload the log to this physical_address
   146          local storage_props = utils.parse_storage_uri(table_log_physical_path)
   147          --[[
   148              table_log:
   149                  {
   150                      <version1>.json : [
   151                          {"commitInfo":...}, {"add": ...}, {"remove": ...},...
   152                      ],
   153                      <version2>.json : [
   154                          {"commitInfo":...}, {"add": ...}, {"remove": ...},...
   155                      ],...
   156                  }
   157          ]]
   158          for entry_version, table_entry in pairs(table_log) do
   159              local table_entry_string = ""
   160              for _, content_entry in ipairs(table_entry) do
   161                  table_entry_string = table_entry_string .. content_entry
   162              end
   163              local version_key = storage_props.key .. "/" .. entry_version
   164              write_object(storage_props.bucket, version_key, table_entry_string)
   165          end
   166          -- Save physical path using the path_transformer if exists
   167          if path_transformer ~= nil then
   168              table_physical_path = path_transformer(table_physical_path)
   169          end
   170          local table_val = {
   171              path=table_physical_path,
   172              metadata=metadata,
   173          }
   174          response[table_name_yaml] = table_val
   175      end
   176      return response
   177  end
   178  
   179  return {
   180      export_delta_log = export_delta_log,
   181  }