github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/glue_exporter.lua

local pathlib = require("path")
local json = require("encoding/json")
local lakefs = require("lakefs")
local extractor = require("lakefs/catalogexport/table_extractor")
local utils = require("lakefs/catalogexport/internal")
local sym_exporter = require("lakefs/catalogexport/symlink_exporter")

--[[
    Generate the Glue table name
    @descriptor(Table): table descriptor object (e.g. from _lakefs_tables/my_table.yaml)
    @action_info(Table): the global action object
]]
local function get_full_table_name(descriptor, action_info)
    local commit_id = action_info.commit_id
    local repo_id = action_info.repository_id
    local branch_or_tag = utils.ref_from_branch_or_tag(action_info)
    local sha = utils.short_digest(commit_id)
    return string.format("%s_%s_%s_%s", descriptor.name, repo_id, branch_or_tag, sha)
end

-- map Hive types to Glue types
local typesMapping = {
    integer = "int"
}

-- helper function to convert a Hive column to part of a Glue CreateTable input
local function hive_col_to_glue(col)
    return {
        Name = col.name,
        Type = typesMapping[col.type] or col.type,
        Comment = col.comment,
        Parameters = col.parameters
    }
end

-- create the list of partitions for the Glue input from a Hive descriptor
local function hive_partitions_to_glue_input(descriptor)
    local partitions = {}
    local cols = descriptor.schema.fields or {}
    -- index the columns list by name for lookup
    for _, c in ipairs(cols) do
        cols[c.name] = c
    end
    -- iterate over partition names in order and find them in the fields; the order determines the path layout in storage
    for _, part_key in ipairs(descriptor.partition_columns) do
        local col = cols[part_key]
        if col == nil then
            error(string.format("partition name `%s` not found in table `%s`", part_key, descriptor.name))
        end
        table.insert(partitions, hive_col_to_glue(col))
    end
    return partitions
end

-- create the list of columns for the Glue input, excluding partitions
local function hive_columns_to_glue_input(descriptor)
    -- build a set of partition names, since they must not appear in the Glue columns input
    local partition_names = {}
    for _, p in ipairs(descriptor.partition_columns) do
        partition_names[p] = true
    end
    -- create columns as inputs for Glue
    local columns = {}
    local cols = descriptor.schema.fields or {}
    for _, col in ipairs(cols) do
        if not partition_names[col.name] then -- not a partition
            table.insert(columns, hive_col_to_glue(col))
        end
    end
    return columns
end

-- default location value (e.g. the root location of either partitions or a flat symlink.txt file)
local function get_table_location(storage_base_prefix, descriptor, action_info)
    local commit_id = action_info.commit_id
    local export_base_uri = utils.get_storage_uri_prefix(storage_base_prefix, commit_id, action_info)
    return pathlib.join("/", export_base_uri, descriptor.name)
end

-- create a standard AWS Glue table input (i.e. not Apache Iceberg): copy the base input values and configure the rest
local function build_glue_create_table_input(base_input, descriptor, symlink_location, columns, partitions, action_info,
    options)
    local input = utils.deepcopy(base_input)
    local opts = options or {}
    input.Name = opts.table_name or get_full_table_name(descriptor, action_info)
    input.PartitionKeys = array(partitions)
    input.TableType = "EXTERNAL_TABLE"
    input.StorageDescriptor.Columns = array(columns)
    input.StorageDescriptor.Location = symlink_location
    return input
end
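--[[
    Illustrative only (an assumption, not part of the original module): the shape of a
    minimal `base_input`/`create_table_input` a caller might pass for a symlink-based
    Hive table. This module fills in Name, TableType, PartitionKeys,
    StorageDescriptor.Columns and StorageDescriptor.Location; the caller supplies the
    data-format fields. The class names below assume Parquet data read via Hive's
    symlink input format.

    local example_base_input = {
        StorageDescriptor = {
            InputFormat = "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat",
            OutputFormat = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            SerdeInfo = {
                SerializationLibrary = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
            }
        }
    }
]]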
--[[
    Create a standard Glue table in the Glue catalog
    @glue: AWS Glue client
    @db(string): Glue database name
    @table_src_path(string): path to the table spec (e.g. _lakefs_tables/my_table.yaml)
    @create_table_input(Table): struct mapping to the TableInput in AWS, see https://docs.aws.amazon.com/glue/latest/webapi/API_CreateTable.html#API_CreateTable_RequestSyntax
    should contain the inputs describing the data format (e.g. InputFormat, OutputFormat, SerdeInfo), since the exporter is agnostic to them.
    by default this function will configure the table location and schema.
    @action_info(Table): the global action object
    @options:
    - table_name(string): override the default Glue table name
    - debug(boolean)
    - export_base_uri(string): override the default S3 prefix for the symlink location, e.g. s3://other-bucket/path/
]]
local function export_glue(glue, db, table_src_path, create_table_input, action_info, options)
    local opts = options or {}
    local repo_id = action_info.repository_id
    local commit_id = action_info.commit_id

    -- get the table descriptor from _lakefs_tables/
    local descriptor = extractor.get_table_descriptor(lakefs, repo_id, commit_id, table_src_path)

    -- get the table symlink location URI
    local base_prefix = opts.export_base_uri or action_info.storage_namespace
    local symlink_location = get_table_location(base_prefix, descriptor, action_info)

    -- parse the Hive table
    local columns = {}
    local partitions = {}
    if descriptor.type == "hive" then
        -- convert Hive columns/partitions to Glue
        partitions = hive_partitions_to_glue_input(descriptor)
        columns = hive_columns_to_glue_input(descriptor)
    else
        error(string.format("table type `%s` in path `%s` not supported", descriptor.type, table_src_path))
    end

    -- finalize the Glue CreateTable input
    local table_input = build_glue_create_table_input(create_table_input, descriptor, symlink_location, columns,
        partitions, action_info, opts)

    -- create the table
    local json_input = json.marshal(table_input)
    if opts.debug then
        print("Creating Glue Table - input:", json_input)
    end
    glue.create_table(db, json_input)
    return {
        table_input = table_input
    }
end

return {
    get_full_table_name = get_full_table_name,
    export_glue = export_glue
}
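--[[
    Usage sketch (a hedged example, not part of the original module): calling the
    exporter from a lakeFS action script. Assumes the action passes AWS credentials
    and catalog settings via `args`, and that the runtime's `aws` module exposes
    `glue_client`; adjust the names to your hook configuration.

    local aws = require("aws")
    local exporter = require("lakefs/catalogexport/glue_exporter")

    local glue = aws.glue_client(args.aws.access_key_id, args.aws.secret_access_key, args.aws.region)
    exporter.export_glue(glue, args.catalog.db_name, args.table_source, args.catalog.table_input, action, { debug = true })
]]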