github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/lakefs/catalogexport/glue_exporter.lua

local pathlib = require("path")
local json = require("encoding/json")
local lakefs = require("lakefs")
local extractor = require("lakefs/catalogexport/table_extractor")
local utils = require("lakefs/catalogexport/internal")

--[[
    Generate the Glue table name
    @descriptor(Table): table descriptor object (e.g. parsed from _lakefs_tables/my_table.yaml)
    @action_info(Table): the global action object
]]
local function get_full_table_name(descriptor, action_info)
    local commit_id = action_info.commit_id
    local repo_id = action_info.repository_id
    local branch_or_tag = utils.ref_from_branch_or_tag(action_info)
    local sha = utils.short_digest(commit_id)
    return string.format("%s_%s_%s_%s", descriptor.name, repo_id, branch_or_tag, sha)
end
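-- For example (illustrative values): a descriptor named "animals", exported from
-- repository "repo1" at branch "main" with a short commit digest of "c4e2ba",
-- yields the Glue table name "animals_repo1_main_c4e2ba".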

-- map Hive types to Glue types
local typesMapping = {
    integer = "int"
}

-- helper function to convert a Hive column into its Glue create-table input form
local function hive_col_to_glue(col)
    return {
        Name = col.name,
        Type = typesMapping[col.type] or col.type,
        Comment = col.comment,
        Parameters = col.parameters
    }
end
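-- For example (illustrative column): { name = "year", type = "integer", comment = "event year" }
-- becomes { Name = "year", Type = "int", Comment = "event year" } in the Glue input.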

-- Create the list of partitions for the Glue input from a Hive descriptor
local function hive_partitions_to_glue_input(descriptor)
    local partitions = {}
    local cols = descriptor.schema.fields or {}
    -- index the columns by name for lookup (a Lua table can hold both array and map entries)
    for _, c in ipairs(cols) do
        cols[c.name] = c
    end
    -- iterate over the partition columns in order and look each one up in the fields;
    -- the order determines the partition path layout in storage
    for _, part_key in ipairs(descriptor.partition_columns) do
        local col = cols[part_key]
        if col == nil then
            error(string.format("partition name `%s` not found in table `%s`", part_key, descriptor.name))
        end
        table.insert(partitions, hive_col_to_glue(col))
    end
    return partitions
end
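-- For example (illustrative descriptor): with partition_columns = {"year", "month"},
-- the returned list preserves that order, matching a Hive-style storage layout of
-- .../year=<v>/month=<v>/ under the table location.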

-- Create the list of columns for Glue, excluding partition columns
local function hive_columns_to_glue_input(descriptor)
    -- build a set of partition names, since partition columns must not appear in the Glue columns input
    local partition_names = {}
    for _, p in ipairs(descriptor.partition_columns) do
        partition_names[p] = true
    end
    -- convert the remaining columns to Glue inputs
    local columns = {}
    local cols = descriptor.schema.fields or {}
    for _, col in ipairs(cols) do
        if not partition_names[col.name] then -- not a partition
            table.insert(columns, hive_col_to_glue(col))
        end
    end
    return columns
end

-- Default table location (e.g. the root location of either the partition directories or a flat symlink.txt file)
local function get_table_location(storage_base_prefix, descriptor, action_info)
    local commit_id = action_info.commit_id
    local export_base_uri = utils.get_storage_uri_prefix(storage_base_prefix, commit_id, action_info)
    return pathlib.join("/", export_base_uri, descriptor.name)
end
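-- For example (illustrative): with a base prefix of s3://bucket/namespace and a table
-- named "animals", the location is the commit-scoped export prefix computed by
-- utils.get_storage_uri_prefix with "animals" appended.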

-- Create a standard AWS Glue table input (i.e. not Apache Iceberg): merge the caller's base input
-- with the values configured here (name, type, columns, partitions and location)
local function build_glue_create_table_input(base_input, descriptor, symlink_location, columns, partitions, action_info,
    options)
    local input = utils.deepcopy(base_input)
    local opts = options or {}
    input.Name = opts.table_name or get_full_table_name(descriptor, action_info)
    -- array() is a global provided by the lakeFS Lua runtime for JSON array marshaling
    input.PartitionKeys = array(partitions)
    input.TableType = "EXTERNAL_TABLE"
    input.StorageDescriptor.Columns = array(columns)
    input.StorageDescriptor.Location = symlink_location
    return input
end
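-- Note that base_input is deep-copied, so the caller's format-specific fields (e.g. a
-- StorageDescriptor with InputFormat/OutputFormat/SerdeInfo) are preserved, with Name,
-- TableType, PartitionKeys, Columns and Location set on top of the copy.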

--[[
    Create a standard Glue table in the Glue catalog
    @glue: AWS Glue client
    @db(string): Glue database name
    @table_src_path(string): path to the table spec (e.g. _lakefs_tables/my_table.yaml)
    @create_table_input(Table): struct mapping to TableInput in AWS: https://docs.aws.amazon.com/glue/latest/webapi/API_CreateTable.html#API_CreateTable_RequestSyntax
    It should contain the inputs describing the data format (e.g. InputFormat, OutputFormat, SerdeInfo), since the exporter is agnostic to those.
    By default this function will configure the table location and schema.
    @action_info(Table): the global action object
    @options:
    - table_name(string): override the default Glue table name
    - debug(boolean)
    - export_base_uri(string): override the default S3 prefix for the symlink location, e.g. s3://other-bucket/path/
]]
local function export_glue(glue, db, table_src_path, create_table_input, action_info, options)
    local opts = options or {}
    local repo_id = action_info.repository_id
    local commit_id = action_info.commit_id

    -- get the table descriptor from _lakefs_tables/
    local descriptor = extractor.get_table_descriptor(lakefs, repo_id, commit_id, table_src_path)

    -- get the table's symlink location URI
    local base_prefix = opts.export_base_uri or action_info.storage_namespace
    local symlink_location = get_table_location(base_prefix, descriptor, action_info)

    -- parse the Hive table
    local columns = {}
    local partitions = {}
    if descriptor.type == "hive" then
        -- convert Hive columns/partitions to Glue inputs
        partitions = hive_partitions_to_glue_input(descriptor)
        columns = hive_columns_to_glue_input(descriptor)
    else
        error("table type `" .. descriptor.type .. "` in path `" .. table_src_path .. "` is not supported")
    end

    -- finalize the Glue create table input
    local table_input = build_glue_create_table_input(create_table_input, descriptor, symlink_location, columns,
        partitions, action_info, opts)

    -- create the table
    local json_input = json.marshal(table_input)
    if opts.debug then
        print("Creating Glue Table - input:", json_input)
    end
    glue.create_table(db, json_input)
    return {
        table_input = table_input
    }
end
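
-- Example usage from a lakeFS action hook (a sketch; the client construction and the
-- args/database names below are illustrative and depend on your action configuration):
--
--   local aws = require("aws")
--   local exporter = require("lakefs/catalogexport/glue_exporter")
--
--   local glue = aws.glue_client(args.aws.access_key_id, args.aws.secret_access_key, args.aws.region)
--   -- the base input supplies the format-specific parts the exporter is agnostic to
--   local table_input = {
--       StorageDescriptor = {
--           InputFormat = "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat",
--           OutputFormat = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
--           SerdeInfo = {
--               SerializationLibrary = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
--           }
--       }
--   }
--   exporter.export_glue(glue, "my_glue_db", "_lakefs_tables/my_table.yaml", table_input, action, { debug = true })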

return {
    get_full_table_name = get_full_table_name,
    export_glue = export_glue
}