github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/spark/run-test.py
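
"""Run the lakeFS Spark integration test.

Waits for lakeFS to come up, creates a repository, uploads the sonnets
data set, then runs the Sonnets word-count job via spark-submit inside
docker-compose, accessing lakeFS through the S3 gateway, the lakeFS
Hadoop FileSystem, or the Hadoop FileSystem with presigned URLs.
Exits with the Spark job's exit code.
"""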

import argparse
import sys

import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient
from python_on_whales import docker
from tenacity import retry, stop_after_attempt, wait_fixed

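
# Flatten one level of nesting, e.g. [[a, b], [c]] -> [a, b, c].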
def flatten(lst):
    return [item for sublist in lst for item in sublist]

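
# Build the spark-submit invocation: caller-supplied submit flags, the Spark
# configuration as repeated -c key=value pairs, and the Sonnets job jar with
# its arguments. The master URL points at the docker-compose Spark master.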
def get_spark_submit_cmd(submit_flags, spark_config, jar_name, jar_args):
    cmd = ["spark-submit", "--master", "spark://spark:7077"]
    cmd.extend(submit_flags)
    configs = flatten([['-c', f"{k}={v}"] for k, v in spark_config.items()])
    cmd.extend(configs)
    cmd.extend(["--class", "Sonnets", f"/local/app/target/{jar_name}"])
    cmd.extend(jar_args)
    return cmd

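
# lakeFS rejects API calls until its initial setup has completed, so retry
# the listing once per second, up to 7 attempts, before giving up.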
@retry(wait=wait_fixed(1), stop=stop_after_attempt(7))
def wait_for_setup(lfs_client):
    repositories = lfs_client.repositories.list_repositories()
    assert repositories.results is not None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--storage_namespace", required=True)
    parser.add_argument("--repository", required=True)
    parser.add_argument("--sonnet_jar", required=True)
    parser.add_argument("--client_version")
    parser.add_argument("--aws_access_key")
    parser.add_argument("--aws_secret_key")
    parser.add_argument("--redirect", action='store_true')
    parser.add_argument("--access_mode", choices=["s3_gateway", "hadoopfs", "hadoopfs_presigned"], default="s3_gateway")
    parser.add_argument("--region")
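    # Fixed credentials for the lakeFS instance under test; these are the
    # well-known AWS documentation example keys, not real secrets.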
    lakefs_access_key = 'AKIAIOSFODNN7EXAMPLE'
    lakefs_secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

    args = parser.parse_args()
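    # Prefer a published client from Maven when --client_version is given;
    # otherwise use the locally built client jar.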
    if args.client_version:
        submit_flags = ["--packages", f"io.lakefs:hadoop-lakefs-assembly:{args.client_version}"]
    else:
        submit_flags = ["--jars", "/target/client.jar"]

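    # Connect to lakeFS, block until it is ready, and create the test
    # repository with 'main' as its default branch.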
    lfs_client = LakeFSClient(
        lakefs_client.Configuration(username=lakefs_access_key,
                                    password=lakefs_secret_key,
                                    host='http://localhost:8000'))
    wait_for_setup(lfs_client)
    lfs_client.repositories.create_repository(
        models.RepositoryCreation(name=args.repository,
                                  storage_namespace=args.storage_namespace,
                                  default_branch='main'))

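    # Seed the new repository's main branch with the test input file.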
    with open('./app/data-sets/sonnets.txt', 'rb') as f:
        lfs_client.objects.upload_object(repository=args.repository, branch="main", path="sonnets.txt", content=f)
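
    # Shared configuration for both lakeFS Hadoop FileSystem modes: route
    # lakefs:// URIs through io.lakefs.LakeFSFileSystem against the
    # dockerized lakeFS API endpoint.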
    base_hadoopfs_config = {
        "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem",
        "spark.driver.extraJavaOptions": "-Dcom.amazonaws.services.s3.enableV4=true",
        "spark.executor.extraJavaOptions": "-Dcom.amazonaws.services.s3.enableV4=true",
        "spark.hadoop.fs.lakefs.endpoint": "http://lakefs:8000/api/v1",
        "spark.hadoop.fs.lakefs.access.key": lakefs_access_key,
        "spark.hadoop.fs.lakefs.secret.key": lakefs_secret_key,
    }

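    # Pick the URI scheme and Spark configuration for the selected access
    # mode: lakeFS Hadoop FileSystem with direct S3 access (hadoopfs) or
    # presigned URLs (hadoopfs_presigned), or the lakeFS S3 gateway over
    # s3a:// (the default).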
    if args.access_mode == 'hadoopfs':
        scheme = "lakefs"
        spark_configs = {
            **base_hadoopfs_config,
            "spark.hadoop.fs.s3a.access.key": args.aws_access_key,
            "spark.hadoop.fs.s3a.secret.key": args.aws_secret_key,
            "spark.hadoop.fs.s3a.region": args.region,
        }
    elif args.access_mode == 'hadoopfs_presigned':
        scheme = "lakefs"
        spark_configs = {
            **base_hadoopfs_config,
            "spark.hadoop.fs.lakefs.access.mode": "presigned",
        }
    else:
        scheme = "s3a"
        spark_configs = {"spark.hadoop.fs.s3a.access.key": lakefs_access_key,
                         "spark.hadoop.fs.s3a.secret.key": lakefs_secret_key,
                         "spark.hadoop.fs.s3a.endpoint": "s3.docker.lakefs.io:8000",
                         "spark.hadoop.fs.s3a.connection.ssl.enabled": "false"}
        if args.redirect:
            spark_configs["spark.hadoop.fs.s3a.path.style.access"] = "true"
            spark_configs["spark.hadoop.fs.s3a.signing-algorithm"] = "QueryStringSignerType"
            spark_configs["spark.hadoop.fs.s3a.user.agent.prefix"] = "s3RedirectionSupport"

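    # Launch spark-submit as a one-off compose container; dependencies=False
    # avoids starting the services it depends on (they are already up), and
    # stream=True yields the container's output as it runs.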
    generator = docker.compose.run("spark-submit",
                                   get_spark_submit_cmd(submit_flags, spark_configs, args.sonnet_jar,
                                                        [f"{scheme}://{args.repository}/main/sonnets.txt",
                                                         f"{scheme}://{args.repository}/main/sonnets-wordcount"]),
                                   dependencies=False,
                                   tty=False,
                                   stream=True,
                                   name="submit")

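    # Echo the job output, then propagate the container's exit code as the
    # script's own, removing the named container either way.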
    for _, stream_content in generator:
        print(stream_content.decode(), end="")
    state = docker.container.inspect("submit").state
    if state.exit_code != 0:
        print(state.error, file=sys.stderr)
    docker.container.remove("submit")
    sys.exit(state.exit_code)


if __name__ == '__main__':
    main()