github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/dataset_validator.lua (about)

     1  --[[
     2  
     3  Validate the existence of mandatory metadata describing a dataset.
     4  A metadata file should exist either in the same directory as the modified dataset, or in any parent directory.
     5  The closest metadata file would take precedence (i.e. same folder > parent > 2nd parent).
     6  
     7  # Example hook definition (_lakefs_actions/validate_dataset_fields.yaml):
     8  name: Validate Dataset Fields
     9  description: Validate the existence of mandatory metadata describing a dataset.
    10  on:
    11    pre-merge:
    12      branches:
    13        - main
    14  hooks:
    15    - id: validate_datasets
    16      type: lua
    17      properties:
    18        script_path: scripts/dataset_validator.lua
    19        args:
    20          prefix: 'datasets/'
    21          metadata_file_name: dataset_metadata.yaml
    22          fields:
    23            - name: contains_pii
    24              required: true
    25              type: boolean
    26            - name: approval_link
    27              required: true
    28              type: string
    29              match_pattern: 'https?:\/\/.*'
    30            - name: rank
    31              required: true
    32              type: number
    33            - name: department
    34              type: string
    35              choices: ['hr', 'it', 'other']
    36  ]]
    37  
    38  path = require("path")
    39  regexp = require("regexp")
    40  yaml = require("encoding/yaml")
    41  
    42  lakefs = require("lakefs")
    43  hook = require("hook")
    44  
    -- Report whether `value` is equal (==) to one of the entries in the
    -- `choices` list. The scan stops at the first match; an empty list or a
    -- value that never matches yields false.
    function is_a_valid_choice(choices, value)
        local found = false
        for _, candidate in ipairs(choices) do
            if candidate == value then
                found = true
                break
            end
        end
        return found
    end
    53  
    -- Validate a single metadata value against its field descriptor and report
    -- the first violation through hook.fail.
    --
    -- field_descriptor: table with `name`, and optionally `required`, `type`
    --                   (compared against Lua's type() result), `choices`
    --                   (list of allowed values) and `match_pattern`
    --                   (expression handed to regexp.match).
    -- value:            the value read from the metadata file, or nil when the
    --                   field is absent.
    -- filename:         metadata file path, used as the prefix of every message.
    --
    -- NOTE(review): hook.fail is expected to abort the hook like error() does --
    -- confirm against the lakeFS Lua hook documentation.
    function check_field(field_descriptor, value, filename)
        local name = field_descriptor.name

        -- An absent value is only a problem for required fields. Optional fields
        -- that are simply not set must not be pushed through the type/choices
        -- checks below, which would otherwise reject nil.
        if value == nil then
            if field_descriptor.required then
                hook.fail(filename .. ": field '" .. name .. "' is required but no value given")
            end
            return
        end

        -- check type is correct
        local expected_type = field_descriptor.type
        if expected_type ~= nil and type(value) ~= expected_type then
            hook.fail(filename .. ": field '" .. name .. "' should be of type " .. expected_type)
        end

        -- check choices
        local choices = field_descriptor.choices
        if choices ~= nil and not is_a_valid_choice(choices, value) then
            hook.fail(filename .. ": field '" .. name .. "' should be one of '" .. table.concat(choices, ", ") .. "'")
        end

        -- check pattern (only text can be matched against a pattern)
        local pattern = field_descriptor.match_pattern
        if pattern ~= nil then
            if type(value) ~= "string" then
                hook.fail(filename .. ": field " .. name .. " should be text (got '" .. type(value) .. "') and match pattern '" .. pattern .. "'")
            elseif not regexp.match(pattern, value) then
                hook.fail(filename .. ": field " .. name .. " should match pattern '" .. pattern .. "'")
            end
        end
    end
    76  
    77  
    -- main flow
    -- Page through the diff returned by lakefs.diff_refs and, for every added
    -- object, walk up its parent directories until a metadata file named
    -- args.metadata_file_name is found. The hook fails when an added object has
    -- no metadata file in any parent directory.
    local after = ""
    local has_more = true
    -- Lookup cache keyed by metadata file path:
    --   table -> parsed content of a metadata file that exists
    --   false -> lookup returned 404, no need to fetch it again
    local metadata_files = {}
    while has_more do
        local code, resp = lakefs.diff_refs(action.repository_id, action.branch_id, action.source_ref, after, args.prefix)
        if code ~= 200 then
            -- tostring keeps the real failure visible even if no message is set
            error("could not diff: " .. tostring(resp.message))
        end
        for _, result in ipairs(resp.results) do
            print("" .. result.type .. " " .. result.path)
            if result.type == "added" then
                -- path of the metadata file covering this object ("" = none found)
                local descriptor_for_file = ""
                local current = result.path

                -- find nearest metadata file
                while true do
                    local parsed = path.parse(current)
                    if not parsed.parent or parsed.parent == "" then
                        -- no more parents to inspect
                        break
                    end
                    local current_descriptor = path.join("/", parsed.parent, args.metadata_file_name)
                    local cached = metadata_files[current_descriptor]
                    if cached then
                        -- cache hit
                        descriptor_for_file = current_descriptor
                        break
                    elseif cached == nil then
                        -- cache miss, attempt to fetch it
                        local status, body = lakefs.get_object(action.repository_id, action.source_ref, current_descriptor)
                        if status == 200 then
                            local metadata = yaml.unmarshal(body)
                            -- A file that does not parse to a table (presumably an
                            -- empty document -- TODO confirm) is kept as an empty
                            -- table so it is still cached and still validated.
                            if type(metadata) ~= "table" then
                                metadata = {}
                            end
                            metadata_files[current_descriptor] = metadata
                            descriptor_for_file = current_descriptor
                            break
                        elseif status ~= 404 then
                            error("failed to look up metadata file: '" .. current_descriptor .. "', HTTP " .. tostring(status))
                        else
                            -- indicates this doesn't exist, no need to look it up again
                            metadata_files[current_descriptor] = false
                        end
                    end
                    -- cached == false (known missing) or fresh 404: try the parent
                    current = parsed.parent
                end

                -- check if we found a descriptor
                if descriptor_for_file == "" then
                    hook.fail("No dataset metadata found for file: " .. result.path)
                end
            end
        end
        -- pagination
        has_more = resp.pagination.has_more
        after = resp.pagination.next_offset
    end
   139  
   -- now let's review all the metadata files for this commit:
   -- metadata_files maps a metadata file path either to its parsed content or to
   -- `false` for paths whose lookup returned 404. The `false` entries carry no
   -- content and must be skipped; indexing them would raise a runtime error.
   for metadata_filename, metadata_file in pairs(metadata_files) do
       if metadata_file then
           for _, field_descriptor in ipairs(args.fields) do
               check_field(field_descriptor, metadata_file[field_descriptor.name], metadata_filename)
           end
       end
   end