github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/gcsfuse_symlink_exporter.lua (about)

     1  --[[
     2  GCSFuse Symlink Exporter
     3  
     4  Export gcsfuse-compatible symlinks from a path in a lakeFS repository.
     5  gcsfuse (https://github.com/GoogleCloudPlatform/gcsfuse) is used by managed Google Cloud services such as Vertex AI.
     6  
     7  Symlinks are supported by writing an empty (0-byte) object with a `gcsfuse_symlink_target` metadata header, with the target
     8    being the metadata header value.
     9  
    10  Note: When mounting using gcsfuse, the --implicit-dirs flag must be used for lakeFS data to appear.
    11  
    12  Args:
    13   - prefix (string): path in lakeFS to export as symlinks
    14   - destination (string): where in gcs should these symlinks be written to
    15   - mount.from (string): will be stripped from the physical address of objects when writing the symlink
    16   - mount.to (string): will be prepended to the physical address of objects when writing the symlink
    17   - write_current_marker (bool, default = true): if set to false, don't write a "current" symlink that points to the latest commit
    18   - gcs_credentials_json_string (string): Google Cloud credentials to use when writing to symlink destination
    19  
    20  
    21  Example hook declaration: (_lakefs_actions/export_images.yaml):
    22  name: export_images
    23  on:
    24    post-commit:
    25      branches:
    26        - main
    27    hooks:
    28      - id: gcsfuse_export_images
    29        type: lua
    30        properties:
    31          script_path: scripts/export_gcs_fuse.lua
    32          args:
    33            prefix: "datasets/images/"
    34            destination: "gs://my-bucket/exports/my-repo/"
    35            mount:
    36              from: "gs://my-bucket/repos/my-repo/"
    37              to: "/gcs/my-bucket/repos/my-repo/"
    38            gcs_credentials_json_string: |
    39              {
    40                "client_id": "...",
    41                "client_secret": "...",
    42                "refresh_token": "...",
    43                "type": "..."
    44              }
    45  ]]
    46  
    47  gcloud = require("gcloud")
    48  lakefs = require("lakefs")
    49  path = require("path")
    50  
    51  -- initialize client
    52  print("initializing GS client")
    53  gs = gcloud.gs_client(args.gcs_credentials_json_string)
    54  
    55  -- get the current commit ID and ref
    56  local current_commit = action.commit_id
    57  tag_events = {  ["pre-create-tag"] = true,  ["post-create-tag"] = true }
    58  branch_events = {  ["pre-create-branch"] = true,  ["post-create-branch"] = true, ["post-commit"] = true, ["post-merge"] = true }
    59  local ref
    60  local ref_type
    61  if tag_events[action.event_type] then
    62      ref = action.tag_id
    63      ref_type = "tags"
    64  elseif branch_events[action.event_type] then
    65      ref = action.branch_id
    66      ref_type = "branches"
    67  else
    68      error("unsupported event type: " .. action.event_type)
    69  end
    70  print("using ref_type = " .. ref_type .. ", ref = " .. ref)
    71  
    72  local total = 0
    73  local after = ""
    74  local has_more = true
    75  local out = path.join("/", args.destination, "commits", current_commit)
    76  
    77  while has_more do
    78      local code, resp = lakefs.list_objects(action.repository_id, current_commit, after, args.prefix, "") -- without delimiter
    79      if code ~= 200 then
    80          error("could not list path: " .. args.prefix .. ", error: " .. resp.message)
    81      end
    82      for _, entry in ipairs(resp.results) do
    83          total = total + 1
    84          gs.write_fuse_symlink(
    85                  entry.physical_address,
    86                  path.join("/", out, entry.path),
    87                  {["from"] = args.mount.from, ["to"] = args.mount.to})
    88      end
    89      -- pagination
    90      has_more = resp.pagination.has_more
    91      after = resp.pagination.next_offset
    92  end
    93  
    94  print("-- done writing object symlinks (" .. total .. " total symlinks created) --")
    95  
    96  if args["write_current_marker"] ~= false then
    97      local marker = path.join("/", args.destination, ref_type, ref)
    98      gs.write_fuse_symlink("../commits/" .. current_commit, marker, {})
    99  end