github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/delta_exporter.lua (about) 1 local lakefs = require("lakefs") 2 local pathlib = require("path") 3 local json = require("encoding/json") 4 local utils = require("lakefs/catalogexport/internal") 5 local extractor = require("lakefs/catalogexport/table_extractor") 6 local strings = require("strings") 7 local url = require("net/url") 8 --[[ 9 delta_log_entry_key_generator returns a closure that returns a Delta Lake version key according to the Delta Lake 10 protocol: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#delta-log-entries 11 Example: 12 local gen = delta_log_entry_key_generator() 13 gen() -- 000000000000000000.json 14 gen() -- 000000000000000001.json 15 ]] 16 local function delta_log_entry_key_generator() 17 local current = 0 18 return function() 19 local delta_log_entry_length = 20 20 local key = tostring(current) 21 local padding_length = delta_log_entry_length - key:len() 22 local padded_key = "" 23 for _ = 1, padding_length do 24 padded_key = padded_key .. "0" 25 end 26 padded_key = padded_key .. key .. ".json" 27 current = current + 1 28 return padded_key 29 end 30 end 31 32 --[[ 33 action: 34 - repository_id 35 - commit_id 36 37 table_def_names: ["table1.yaml", "table2", ...] 38 39 write_object: function(bucket, key, data) 40 41 delta_client: 42 - get_table: function(repo, ref, prefix) 43 44 path_transformer: function(path) used for transforming path scheme (ex: Azure https to abfss) 45 46 ]] 47 local function export_delta_log(action, table_def_names, write_object, delta_client, table_descriptors_path, path_transformer) 48 local repo = action.repository_id 49 local commit_id = action.commit_id 50 if not commit_id then 51 error("missing commit id") 52 end 53 local ns = action.storage_namespace 54 if ns == nil then 55 error("failed getting storage namespace for repo " .. repo) 56 end 57 local response = {} 58 for _, table_name_yaml in ipairs(table_def_names) do 59 60 -- Get the table descriptor 61 local tny = table_name_yaml 62 if not strings.has_suffix(tny, ".yaml") then 63 tny = tny .. ".yaml" 64 end 65 local table_src_path = pathlib.join("/", table_descriptors_path, tny) 66 local table_descriptor = extractor.get_table_descriptor(lakefs, repo, commit_id, table_src_path) 67 local table_path = table_descriptor.path 68 if not table_path then 69 error("table path is required to proceed with Delta catalog export") 70 end 71 local table_name = table_descriptor.name 72 if not table_name then 73 error("table name is required to proceed with Delta catalog export") 74 end 75 76 -- Get Delta table 77 local t, metadata = delta_client.get_table(repo, commit_id, table_path) 78 local sortedKeys = utils.sortedKeys(t) 79 --[[ Pairs of (version, map of json content): 80 (1, 81 { 82 {"commitInfo":{"timestamp":1699276565259,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":9,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"4","numOutputBytes":"1353"},"engineInfo":"Apache-Spark/3.3.2 Delta-Lake/2.3.0","txnId":"eb6816ae-404f-4338-9e1a-2cb0a4626ab3"}} 83 {"add":{"path":"part-00000-a5a20e52-2b3d-440b-97a8-829fbc4a2804-c000.snappy.parquet","partitionValues":{},"size":1353,"modificationTime":1699276565000,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"Hylak_id\":18,\"Lake_name\":\" \",\"Country\":\"Malawi\",\"Depth_m\":3.0},\"maxValues\":{\"Hylak_id\":16138,\"Lake_name\":\"Malombe\",\"Country\":\"Malawi\",\"Depth_m\":706.0},\"nullCount\":{\"Hylak_id\":0,\"Lake_name\":0,\"Country\":0,\"Depth_m\":0}}"}} 84 {"remove":{"path":"part-00000-d660b401-ceec-415a-a791-e8d1c7599e3d-c000.snappy.parquet","deletionTimestamp":1699276565259,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":82491}} 85 } 86 ) 87 ]] 88 local table_log = {} 89 local keyGenerator = delta_log_entry_key_generator() 90 for _, key in ipairs(sortedKeys) do 91 local content = t[key] 92 local entry_log = {} 93 --[[ 94 An array of entries: 95 {"commitInfo":{"timestamp":1699276565259,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":9,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"4","numOutputBytes":"1353"},"engineInfo":"Apache-Spark/3.3.2 Delta-Lake/2.3.0","txnId":"eb6816ae-404f-4338-9e1a-2cb0a4626ab3"}}, 96 {"add":{"path":"part-00000-a5a20e52-2b3d-440b-97a8-829fbc4a2804-c000.snappy.parquet","partitionValues":{},"size":1353,"modificationTime":1699276565000,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"Hylak_id\":18,\"Lake_name\":\" \",\"Country\":\"Malawi\",\"Depth_m\":3.0},\"maxValues\":{\"Hylak_id\":16138,\"Lake_name\":\"Malombe\",\"Country\":\"Malawi\",\"Depth_m\":706.0},\"nullCount\":{\"Hylak_id\":0,\"Lake_name\":0,\"Country\":0,\"Depth_m\":0}}"}}, 97 {"remove":{"path":"part-00000-d660b401-ceec-415a-a791-e8d1c7599e3d-c000.snappy.parquet","deletionTimestamp":1699276565259,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":82491}} 98 ]] 99 for _, e in ipairs(content) do 100 local entry = json.unmarshal(e) 101 local p = "" 102 if entry.add ~= nil then 103 p = entry.add.path 104 elseif entry.remove ~= nil then 105 p = entry.remove.path 106 end 107 if p ~= "" then 108 local unescaped_path = url.query_unescape(p) 109 if not unescaped_path then 110 error("failed unescaping path: " .. p) 111 end 112 unescaped_path = pathlib.join("/", table_path, unescaped_path) 113 local code, obj = lakefs.stat_object(repo, commit_id, unescaped_path) 114 if code == 200 then 115 local obj_stat = json.unmarshal(obj) 116 --[[ 117 This code block handles escaping of the physical address path part 118 Since we don't want to escape the entire URL (i.e. schema, host), we parse the url and rebuild it. 119 Building the url will then handle any escaping needed on the relevant parts. 120 ]] 121 local u = url.parse(obj_stat["physical_address"]) 122 local physical_path = url.build_url(u["scheme"], u["host"], u["path"]) 123 if path_transformer ~= nil then 124 physical_path = path_transformer(physical_path) 125 end 126 if entry.add ~= nil then 127 entry.add.path = physical_path 128 elseif entry.remove ~= nil then 129 entry.remove.path = physical_path 130 end 131 else 132 error("failed stat_object with code: " .. tostring(code) .. ", and path: " .. unescaped_path) 133 end 134 end 135 local entry_m = json.marshal(entry) 136 table.insert(entry_log, entry_m) 137 end 138 table_log[keyGenerator()] = entry_log 139 end 140 141 local table_export_prefix = utils.get_storage_uri_prefix(ns, commit_id, action) 142 local table_physical_path = pathlib.join("/", table_export_prefix, table_name) 143 local table_log_physical_path = pathlib.join("/", table_physical_path, "_delta_log") 144 145 -- Upload the log to this physical_address 146 local storage_props = utils.parse_storage_uri(table_log_physical_path) 147 --[[ 148 table_log: 149 { 150 <version1>.json : [ 151 {"commitInfo":...}, {"add": ...}, {"remove": ...},... 152 ], 153 <version2>.json : [ 154 {"commitInfo":...}, {"add": ...}, {"remove": ...},... 155 ],... 156 } 157 ]] 158 for entry_version, table_entry in pairs(table_log) do 159 local table_entry_string = "" 160 for _, content_entry in ipairs(table_entry) do 161 table_entry_string = table_entry_string .. content_entry 162 end 163 local version_key = storage_props.key .. "/" .. entry_version 164 write_object(storage_props.bucket, version_key, table_entry_string) 165 end 166 -- Save physical path using the path_transformer if exists 167 if path_transformer ~= nil then 168 table_physical_path = path_transformer(table_physical_path) 169 end 170 local table_val = { 171 path=table_physical_path, 172 metadata=metadata, 173 } 174 response[table_name_yaml] = table_val 175 end 176 return response 177 end 178 179 return { 180 export_delta_log = export_delta_log, 181 }