github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/dataset_validator.lua

--[[

Validate the existence of mandatory metadata describing a dataset.
A metadata file should exist either in the same directory as the modified dataset, or in any parent directory.
The closest metadata file takes precedence (i.e. same folder > parent > 2nd parent).

# Example hook definition (_lakefs_actions/validate_dataset_fields.yaml):
name: Validate Dataset Fields
description: Validate the existence of mandatory metadata describing a dataset.
on:
  pre-merge:
    branches:
      - main
hooks:
  - id: validate_datasets
    type: lua
    properties:
      script_path: scripts/dataset_validator.lua
      args:
        prefix: 'datasets/'
        metadata_file_name: dataset_metadata.yaml
        fields:
          - name: contains_pii
            required: true
            type: boolean
          - name: approval_link
            required: true
            type: string
            match_pattern: 'https?:\/\/.*'
          - name: rank
            required: true
            type: number
          - name: department
            type: string
            choices: ['hr', 'it', 'other']
]]

path = require("path")
regexp = require("regexp")
yaml = require("encoding/yaml")

lakefs = require("lakefs")
hook = require("hook")

-- return true if value is one of the allowed choices
function is_a_valid_choice(choices, value)
    for _, c in ipairs(choices) do
        if c == value then
            return true
        end
    end
    return false
end

-- validate a single field value against its descriptor (required, type, choices, match_pattern)
function check_field(field_descriptor, value, filename)
    -- check required but missing
    if value == nil and field_descriptor.required then
        hook.fail(filename .. ": field '" .. field_descriptor.name .. "' is required but no value given")
    end
    -- check type is correct (only when a value is present; optional fields may be absent)
    if value ~= nil and field_descriptor.type ~= nil and type(value) ~= field_descriptor.type then
        hook.fail(filename .. ": field '" .. field_descriptor.name .. "' should be of type " .. field_descriptor.type)
    end
    -- check choices (only when a value is present)
    if value ~= nil and field_descriptor.choices ~= nil and not is_a_valid_choice(field_descriptor.choices, value) then
        hook.fail(filename .. ": field '" .. field_descriptor.name .. "' should be one of '" .. table.concat(field_descriptor.choices, ", ") .. "'")
    end
    -- check pattern
    if field_descriptor.match_pattern ~= nil then
        if value ~= nil and type(value) ~= "string" then
            hook.fail(filename .. ": field '" .. field_descriptor.name .. "' should be text (got '" .. type(value) .. "') and match pattern '" .. field_descriptor.match_pattern .. "'")
        elseif value ~= nil and not regexp.match(field_descriptor.match_pattern, value) then
            hook.fail(filename .. ": field '" .. field_descriptor.name .. "' should match pattern '" .. field_descriptor.match_pattern .. "'")
        end
    end
end


-- main flow
after = ""
has_more = true
metadata_files = {}
while has_more do
    local code, resp = lakefs.diff_refs(action.repository_id, action.branch_id, action.source_ref, after, args.prefix)
    if code ~= 200 then
        error("could not diff: " .. resp.message)
    end
    for _, result in pairs(resp.results) do
        print(result.type .. " " .. result.path)
        if result.type == "added" then
            has_parent = true
            current = result.path
            descriptor_for_file = ""

            -- walk up the directory tree to find the nearest metadata file
            while has_parent do
                parsed = path.parse(current)
                if not parsed.parent or parsed.parent == "" then
                    has_parent = false
                    break
                end
                current_descriptor = path.join("/", parsed.parent, args.metadata_file_name)
                -- check if this descriptor has already been cached
                if metadata_files[current_descriptor] then
                    -- cache hit: metadata file already fetched and parsed
                    descriptor_for_file = current_descriptor
                    break
                elseif metadata_files[current_descriptor] == nil then
                    -- cache miss: attempt to fetch it
                    code, body = lakefs.get_object(action.repository_id, action.source_ref, current_descriptor)
                    if code == 200 then
                        metadata_files[current_descriptor] = yaml.unmarshal(body)
                        descriptor_for_file = current_descriptor
                        break
                    elseif code ~= 404 then
                        error("failed to look up metadata file: '" .. current_descriptor .. "', HTTP " .. tostring(code))
                    else
                        -- 404: this descriptor doesn't exist, cache the miss so we don't look it up again
                        metadata_files[current_descriptor] = false
                    end
                end

                current = parsed.parent
            end

            -- fail the action if no metadata file was found for this dataset file
            if descriptor_for_file == "" then
                hook.fail("No dataset metadata found for file: " .. result.path)
            end
        end
    end
    -- pagination
    has_more = resp.pagination.has_more
    after = resp.pagination.next_offset
end

-- now let's review all the metadata files for this commit:
for metadata_filename, metadata_file in pairs(metadata_files) do
    -- skip descriptors cached as "not found" (false)
    if metadata_file then
        for _, field_descriptor in ipairs(args.fields) do
            check_field(field_descriptor, metadata_file[field_descriptor.name], metadata_filename)
        end
    end
end
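
--[[
Illustrative sketch (not part of the original script): a dataset_metadata.yaml that would
satisfy the field checks configured in the example hook definition above, placed alongside
the dataset under the configured prefix, e.g. datasets/<dataset-dir>/dataset_metadata.yaml.
All values and the approval URL below are hypothetical.

contains_pii: false
approval_link: https://example.com/approvals/dataset-123
rank: 3
department: hr
]]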