-- Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/gcsfuse_symlink_exporter.lua
--[[
 GCSFuse Symlink Exporter

 Export gcsfuse-compatible symlinks from a path in a lakeFS repository.
 gcsfuse (https://github.com/GoogleCloudPlatform/gcsfuse) is used by managed Google Cloud services such as Vertex AI.

 Symlinks are supported by writing an empty (0-byte) object with a `gcsfuse_symlink_target` metadata header, with the target
 being the metadata header value.

 Note: When mounting using gcsfuse, the --implicit-dirs flag must be used for lakeFS data to appear.

 Output layout (relative to `destination`):
  - commits/<commit_id>/<object path>  one symlink per object listed under `prefix`
  - branches/<branch_id> or tags/<tag_id>  marker symlink targeting "../commits/<commit_id>"

 Supported events:
  - tags:     pre-create-tag, post-create-tag
  - branches: pre-create-branch, post-create-branch, post-commit, post-merge
  Any other event type aborts the hook with an error.

 Args:
  - prefix (string): path in lakeFS to export as symlinks
  - destination (string): where in gcs should these symlinks be written to
  - mount.from (string): will be stripped from the physical address of objects when writing the symlink
  - mount.to (string): will be prepended to the physical address of objects when writing the symlink
  - write_current_marker (bool, default = true): if set to false, don't write a "current" symlink that points to the latest commit
  - gcs_credentials_json_string (string): Google Cloud credentials to use when writing to symlink destination


 Example hook declaration: (_lakefs_actions/export_images.yaml):
 name: export_images
 on:
   post-commit:
     branches:
       - main
 hooks:
   - id: gcsfuse_export_images
     type: lua
     properties:
       script_path: scripts/export_gcs_fuse.lua
       args:
         prefix: "datasets/images/"
         destination: "gs://my-bucket/exports/my-repo/"
         mount:
           from: "gs://my-bucket/repos/my-repo/"
           to: "/gcs/my-bucket/repos/my-repo/"
         gcs_credentials_json_string: |
           {
             "client_id": "...",
             "client_secret": "...",
             "refresh_token": "...",
             "type": "..."
           }
]]

-- NOTE: `args` and `action` are not defined in this script; they are expected
-- to be provided as globals by the runtime that executes the hook.

-- Keep module handles local so the script does not leak names into the
-- global environment.
local gcloud = require("gcloud")
local lakefs = require("lakefs")
local path = require("path")

-- initialize client
print("initializing GS client")
local gs = gcloud.gs_client(args.gcs_credentials_json_string)

-- get the current commit ID and ref
local current_commit = action.commit_id

-- Event types are used as set keys so membership is a single table lookup.
local tag_events = {
    ["pre-create-tag"] = true,
    ["post-create-tag"] = true,
}
local branch_events = {
    ["pre-create-branch"] = true,
    ["post-create-branch"] = true,
    ["post-commit"] = true,
    ["post-merge"] = true,
}

-- `ref` names the tag/branch that triggered the hook; `ref_type` is the
-- directory under the destination where its marker symlink is written.
local ref
local ref_type
if tag_events[action.event_type] then
    ref = action.tag_id
    ref_type = "tags"
elseif branch_events[action.event_type] then
    ref = action.branch_id
    ref_type = "branches"
else
    error("unsupported event type: " .. action.event_type)
end
print("using ref_type = " .. ref_type .. ", ref = " .. ref)

local total = 0       -- number of object symlinks written so far
local after = ""      -- pagination cursor; empty string requests the first page
local has_more = true
-- All object symlinks for this run live under <destination>/commits/<commit_id>.
local out = path.join("/", args.destination, "commits", current_commit)

while has_more do
    local code, resp = lakefs.list_objects(action.repository_id, current_commit, after, args.prefix, "") -- without delimiter
    if code ~= 200 then
        -- The response is not guaranteed to be a table carrying `message`;
        -- coerce whatever came back so the real failure is reported instead
        -- of a secondary "attempt to concatenate/index" error.
        local detail = resp
        if type(resp) == "table" and resp.message ~= nil then
            detail = resp.message
        end
        error("could not list path: " .. args.prefix .. ", error: " .. tostring(detail)
            .. " (status " .. tostring(code) .. ")")
    end
    for _, entry in ipairs(resp.results) do
        total = total + 1
        -- Symlink target is the object's physical address, rewritten using
        -- the mount.from / mount.to arguments.
        gs.write_fuse_symlink(
            entry.physical_address,
            path.join("/", out, entry.path),
            {["from"] = args.mount.from, ["to"] = args.mount.to})
    end
    -- pagination
    has_more = resp.pagination.has_more
    after = resp.pagination.next_offset
end

print("-- done writing object symlinks (" .. total .. " total symlinks created) --")

-- The marker is written unless the argument is explicitly `false`; an absent
-- (nil) argument therefore keeps the documented default of true.
if args["write_current_marker"] ~= false then
    local marker = path.join("/", args.destination, ref_type, ref)
    -- Relative target, so the marker resolves to the sibling "commits" directory.
    gs.write_fuse_symlink("../commits/" .. current_commit, marker, {})
end