-- Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/parquet_schema_validator.lua

--[[
    Parquet schema Validator

    Walks the diff between the target branch and the source ref under each
    configured location, reads the schema of every new/changed ".parquet"
    object and fails the hook if any column name matches a blocked pattern.

    Args:
     - locations (list of strings): locations to look for parquet files under
     - sample (boolean): whether reading one new/changed file per directory is enough, or go through all of them
     - column_block_list (list of strings): patterns of column names that are not allowed;
       each pattern is passed as-is to regexp.match together with the column name

    Example hook declaration: (_lakefs_actions/pre-merge-schema-validation.yaml):

    name: pre merge format check on main
    on:
      pre-merge:
        branches:
          - main
    hooks:
      - id: check_formats
        type: lua
        properties:
          script_path: scripts/parquet_schema_validator.lua # location of this script in the repository!
          args:
            sample: true
            column_block_list: ["user_id", "email", "ssn", "private_*"]
            locations:
              - tables/users/
              - tables/sales/
              - prod/
]]


local lakefs = require("lakefs")
local strings = require("strings")
local parquet = require("encoding/parquet")
local regexp = require("regexp")
local path = require("path")


-- Directories in which a parquet file has already been validated.
-- Only consulted when args.sample is set; shared across all locations.
local visited_directories = {}

-- Fetches the object at object_path from the source ref, reads its parquet
-- schema and raises an error if the object cannot be fetched or if any column
-- name matches one of the patterns in args.column_block_list.
local function validate_parquet_object(object_path)
    local code, content = lakefs.get_object(action.repository_id, action.source_ref, object_path)
    if code ~= 200 then
        -- tostring() keeps the real failure visible even if the body is not a string
        error("could not fetch data file: HTTP " .. tostring(code) .. " body:\n" .. tostring(content))
    end

    local schema = parquet.get_schema(content)

    -- Fail with a readable message instead of an opaque ipairs(nil) error.
    if type(args.column_block_list) ~= "table" then
        error("missing required argument: column_block_list (list of column name patterns)")
    end

    for _, column in ipairs(schema) do
        for _, pattern in ipairs(args.column_block_list) do
            if regexp.match(pattern, column.name) then
                error("Column is not allowed: '" .. column.name .. "': type: " .. column.type .. " in path: " .. object_path)
            end
        end
    end
end

if type(args.locations) ~= "table" then
    error("missing required argument: locations (list of path prefixes)")
end

for _, location in ipairs(args.locations) do
    -- pagination cursor for this location; each location starts from the beginning
    local after = ""
    local has_more = true
    print("checking location: " .. location)
    while has_more do
        print("running diff, location = " .. location .. " after = " .. after)
        local code, resp = lakefs.diff_refs(action.repository_id, action.branch_id, action.source_ref, after, location)
        if code ~= 200 then
            error("could not diff: " .. resp.message)
        end

        for _, result in pairs(resp.results) do
            local p = path.parse(result.path)
            print("checking: '" .. result.path .. "'")
            if not args.sample or (p.parent and not visited_directories[p.parent]) then
                -- only new/changed parquet objects are inspected
                if result.path_type == "object" and result.type ~= "removed"
                    and strings.has_suffix(p.base_name, ".parquet") then
                    validate_parquet_object(result.path)
                    print("\t all columns are valid")
                    -- a nil key would raise "table index is nil"
                    if p.parent then
                        visited_directories[p.parent] = true
                    end
                end
            else
                print("\t skipping path, directory already sampled")
            end
        end

        -- pagination
        has_more = resp.pagination.has_more
        after = resp.pagination.next_offset
    end
end