github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/examples/hooks/parquet_schema_validator.lua

--[[
Parquet Schema Validator

Checks that new or changed Parquet files under the given locations do not
contain any column whose name matches a blocked pattern.

Args:
 - locations (list of strings): path prefixes to scan for Parquet files
 - sample (boolean): if true, validating one new/changed file per directory is enough; if false, every file is checked
 - column_block_list (list of strings): patterns of column names that are not allowed (matched with regexp.match)

Example hook declaration (_lakefs_actions/pre-merge-schema-validation.yaml):

name: pre merge format check on main
on:
  pre-merge:
    branches:
      - main
hooks:
  - id: check_formats
    type: lua
    properties:
      script_path: scripts/parquet_schema_validator.lua # location of this script in the repository!
      args:
        sample: true
        column_block_list: ["user_id", "email", "ssn", "private_*"]
        locations:
          - tables/users/
          - tables/sales/
          - prod/
]]
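
-- Modules provided by the lakeFS Lua hooks runtime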
lakefs = require("lakefs")
strings = require("strings")
parquet = require("encoding/parquet")
regexp = require("regexp")
path = require("path")
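
-- Directories that already had a Parquet file validated (used when sample is true)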
visited_directories = {}

for _, location in ipairs(args.locations) do
    after = ""
    has_more = true
    print("checking location: " .. location)
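    -- Page through the diff results for this location prefix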
    while has_more do
        print("running diff, location = " .. location .. " after = " .. after)
        local code, resp = lakefs.diff_refs(action.repository_id, action.branch_id, action.source_ref, after, location)
        if code ~= 200 then
            error("could not diff: " .. resp.message)
        end

        for _, result in pairs(resp.results) do
            p = path.parse(result.path)
            print("checking: '" .. result.path .. "'")
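            -- In sample mode, only the first new/changed file in each directory is validated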
            if not args.sample or (p.parent and not visited_directories[p.parent]) then
                if result.path_type == "object" and result.type ~= "removed" then
                    if strings.has_suffix(p.base_name, ".parquet") then
                        -- check it!
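                        -- lakefs.get_object returns an HTTP-style status code and the object's content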
                        code, content = lakefs.get_object(action.repository_id, action.source_ref, result.path)
                        if code ~= 200 then
                            error("could not fetch data file: HTTP " .. tostring(code) .. ", body:\n" .. content)
                        end
                        schema = parquet.get_schema(content)
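                        -- Fail the hook if any column name matches a blocked pattern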
                        for _, column in ipairs(schema) do
                            for _, pattern in ipairs(args.column_block_list) do
                                if regexp.match(pattern, column.name) then
                                    error("Column is not allowed: '" .. column.name .. "': type: " .. column.type .. " in path: " .. result.path)
                                end
                            end
                        end
                        print("\t all columns are valid")
                        visited_directories[p.parent] = true
                    end
                end
            else
                print("\t skipping path, directory already sampled")
            end
        end

        -- pagination
        has_more = resp.pagination.has_more
        after = resp.pagination.next_offset
    end
end