--[[
  Automatic Symlink Exporter

  Exports Hive-style symlink.txt manifest files to S3 for a set of lakeFS
  locations, so engines that understand SymlinkTextInputFormat can read
  lakeFS-managed data directly from the underlying object store.

  Args:
   - aws_access_key_id, aws_secret_access_key, aws_region (string): configuration passed to the s3 client that writes symlinks
   - export_bucket (string): bucket to write symlinks to
   - export_path (string): path in the bucket to write symlinks to
   - sources ([]string): lakeFS paths that should be written as symlinks

  Example hook declaration: (_lakefs_actions/auto-symlinks.yaml):

  name: auto symlink
  on:
    post-create-branch:
      branches: ["view-*"]
    post-commit:
      branches: ["view-*"]

  hooks:
    - id: symlink_creator
      type: lua
      properties:
        script_path: scripts/s3_hive_manifest_exporter.lua
        args:
          # Export configuration
          aws_access_key_id: "AKIA..."
          aws_secret_access_key: "..."
          aws_region: us-east-1
          export_bucket: oz-repo
          export_path: lakefs_tables
          sources:
            - tables/my-table/
]]

local aws = require("aws")
local lakefs = require("lakefs")
local path = require("path")

local path_sep = path.default_separator()

-- S3 client used to write/delete symlink manifests and commit markers
local s3 = aws.s3_client(args.aws_access_key_id, args.aws_secret_access_key, args.aws_region)

-- supported event types, grouped by which field on `action` carries the ref
local tag_events = { ["pre-create-tag"] = true, ["post-create-tag"] = true }
local branch_events = { ["pre-create-branch"] = true, ["post-create-branch"] = true }
local commit_events = { ["post-commit"] = true, ["post-merge"] = true }

local current_commit = action.commit_id
local ref
if tag_events[action.event_type] then
    ref = action.tag_id
elseif branch_events[action.event_type] or commit_events[action.event_type] then
    ref = action.branch_id
else
    error("unsupported event type: " .. action.event_type)
end

-- root export path for the current repository
local export_path = path.join(path_sep, args.export_path, "repositories", action.repository_id)

-- Write one symlink.txt for a single directory (`subloc`) under the export
-- path; `files` is a list of physical addresses. No object is written when
-- the list is empty.
local function write_symlink_file(location_export_path, subloc, files)
    if #files == 0 then
        return
    end
    local symlink_path = path.join(path_sep, location_export_path, subloc, "symlink.txt")
    print("writing symlink file for " .. symlink_path)
    s3.put_object(args.export_bucket, symlink_path, table.concat(files, "\n"))
end

for _, location in ipairs(args.sources) do
    local location_export_path = path.join(path_sep, export_path, "refs", ref, location)
    local start_marker = path.join(path_sep, location_export_path, "_start_commit_id")
    local end_marker = path.join(path_sep, location_export_path, "_completed_commit_id")

    -- read start_commit marker from S3 (nil when missing)
    local start_commit, start_exists = s3.get_object(args.export_bucket, start_marker)
    if not start_exists then
        print("no _start_commit_id found for location '" .. location .. "'")
        start_commit = nil
    end
    -- read end_commit marker from S3 (nil when missing)
    local end_commit, end_exists = s3.get_object(args.export_bucket, end_marker)
    if not end_exists then
        print("no _completed_commit_id found for location '" .. location .. "'")
        end_commit = nil
    end

    -- Missing or mismatched markers mean a previous export never completed
    -- (or this ref was never exported): wipe the export path and rebuild
    -- from scratch instead of diffing against an unknown state.
    local clean_mode = false
    if (not start_commit or not end_commit) or (start_commit ~= end_commit) then
        print("going into clean mode for location '" .. location .. "', deleting export path s3://" .. args.export_bucket .. "/" .. location_export_path)
        s3.delete_recursive(args.export_bucket, location_export_path)
        clean_mode = true
    end

    -- mark this export as in-progress
    print("writing _start_commit_id: " .. current_commit)
    s3.put_object(args.export_bucket, start_marker, current_commit)

    if clean_mode then
        -- Full rebuild: list every object under `location` (no delimiter, so
        -- the listing is recursive and ordered by path) and emit one
        -- symlink.txt per directory, flushing whenever the parent changes.
        local after = ""
        local has_more = true
        local current_subloc = ""
        local current_files = {}
        while has_more do
            local code, resp = lakefs.list_objects(action.repository_id, current_commit, after, location, "") -- without delimiter
            if code ~= 200 then
                error("could not list path: " .. location .. ", error: " .. resp.message)
            end
            for _, entry in ipairs(resp.results) do
                local p = path.parse(entry.path)
                if p.parent ~= current_subloc then
                    -- moved on to a new directory: flush the previous one
                    write_symlink_file(location_export_path, current_subloc, current_files)
                    current_subloc = p.parent
                    current_files = {}
                end
                -- hidden entries (e.g. markers) are not part of the data set
                if not path.is_hidden(entry.path) then
                    table.insert(current_files, entry.physical_address)
                end
            end
            -- pagination
            has_more = resp.pagination.has_more
            after = resp.pagination.next_offset
        end
        -- flush whatever is left for the trailing directory
        write_symlink_file(location_export_path, current_subloc, current_files)
    else
        -- Incremental: diff start_commit..current_commit and collect the set
        -- of parent directories ("dirty" locations) that were touched. A
        -- seen-set gives full deduplication even when the same parent shows
        -- up non-contiguously across diff pages (the previous code only
        -- skipped consecutive repeats).
        local dirty_locations = {}
        local seen_parents = {}
        local has_more = true
        local after = ""
        while has_more do
            print("diffing. current commit = " .. current_commit .. ", start commit = " .. start_commit .. ", after = " .. after .. ", location = " .. location)
            local code, resp = lakefs.diff_refs(action.repository_id, start_commit, current_commit, after, location, "") -- recursive
            if code ~= 200 then
                error("could not diff path: " .. location .. ", error: " .. resp.message)
            end
            print("\t got " .. tostring(#resp.results) .. " results, iterating")
            for _, entry in ipairs(resp.results) do
                local p = path.parse(entry.path)
                if not seen_parents[p.parent] then
                    print("adding 'dirty' location: " .. p.parent)
                    seen_parents[p.parent] = true
                    table.insert(dirty_locations, p.parent)
                end
            end
            -- pagination
            has_more = resp.pagination.has_more
            after = resp.pagination.next_offset
        end

        -- regenerate the symlink for every dirty directory from the current
        -- commit's full listing of that directory
        for _, subloc in ipairs(dirty_locations) do
            local sub_has_more = true
            local sub_after = ""
            local current_entries = {}
            while sub_has_more do
                local code, resp = lakefs.list_objects(action.repository_id, current_commit, sub_after, subloc, "") -- without delimiter
                if code ~= 200 then
                    error("could not list path: " .. subloc .. ", error: " .. resp.message)
                end
                for _, entry in ipairs(resp.results) do
                    if not path.is_hidden(entry.path) then
                        table.insert(current_entries, entry.physical_address)
                    end
                end
                -- pagination
                sub_has_more = resp.pagination.has_more
                sub_after = resp.pagination.next_offset
            end
            local symlink_path = path.join(path_sep, location_export_path, subloc, "symlink.txt")
            if #current_entries == 0 then
                -- everything under the directory was deleted: drop the stale manifest
                print("removing stale symlink path: " .. symlink_path)
                s3.delete_object(args.export_bucket, symlink_path)
            else
                print("writing symlink path: " .. symlink_path)
                s3.put_object(args.export_bucket, symlink_path, table.concat(current_entries, "\n"))
            end
        end
    end

    -- done with location! write end_marker so the next run can diff
    s3.put_object(args.export_bucket, end_marker, current_commit)
    print("done! wrote _completed_commit_id: " .. current_commit)
end