# Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/spark/run-test.py
"""Run the ``Sonnets`` Spark application against a local lakeFS server.

The script creates a lakeFS repository, uploads ``sonnets.txt`` to its
``main`` branch, and then runs ``spark-submit`` through the ``spark-submit``
docker compose service. The script exits with the exit code of that container.
"""
import argparse
import sys

import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient
from python_on_whales import docker
from tenacity import retry, stop_after_attempt, wait_fixed


def flatten(lst):
    """Return one list holding the items of every sub-iterable of *lst*, in order."""
    return [item for sublist in lst for item in sublist]


def get_spark_submit_cmd(submit_flags, spark_config, jar_name, jar_args):
    """Build the ``spark-submit`` argument list for the ``Sonnets`` class.

    Args:
        submit_flags: Extra ``spark-submit`` flags, placed right after
            ``--master spark://spark:7077``.
        spark_config: Mapping of Spark property name to value. Each entry
            becomes a ``-c name=value`` pair. Entries whose value is ``None``
            are skipped, so an unset optional command-line argument is not
            passed to Spark as the literal text ``None``.
        jar_name: File name of the application jar under
            ``/local/app/target/``.
        jar_args: Arguments appended after the jar path.

    Returns:
        The full command as a list of strings.
    """
    cmd = ["spark-submit", "--master", "spark://spark:7077"]
    cmd.extend(submit_flags)
    cmd.extend(flatten(
        ["-c", f"{key}={value}"]
        for key, value in spark_config.items()
        if value is not None
    ))
    cmd.extend(["--class", "Sonnets", f"/local/app/target/{jar_name}"])
    cmd.extend(jar_args)
    return cmd


@retry(wait=wait_fixed(1), stop=stop_after_attempt(7))
def wait_for_setup(lfs_client):
    """Call the list-repositories API until it stops raising.

    The ``retry`` decorator re-invokes this function, waiting 1 second between
    attempts and giving up after 7 attempts.
    """
    repositories = lfs_client.repositories.list_repositories()
    # A length is never negative, so this assertion cannot fail on its own;
    # retries are triggered by exceptions raised by the statements themselves.
    assert len(repositories.results) >= 0


def _build_spark_configs(args, lakefs_access_key, lakefs_secret_key):
    """Return ``(scheme, spark_configs)`` for the selected ``args.access_mode``.

    ``hadoopfs`` and ``hadoopfs_presigned`` use the ``lakefs`` URI scheme with
    the ``io.lakefs.LakeFSFileSystem`` settings; any other mode uses the
    ``s3a`` scheme pointed at ``s3.docker.lakefs.io:8000``.
    """
    if args.access_mode in ("hadoopfs", "hadoopfs_presigned"):
        spark_configs = {
            "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem",
            "spark.driver.extraJavaOptions": "-Dcom.amazonaws.services.s3.enableV4=true",
            "spark.executor.extraJavaOptions": "-Dcom.amazonaws.services.s3.enableV4=true",
            "spark.hadoop.fs.lakefs.endpoint": "http://lakefs:8000/api/v1",
            "spark.hadoop.fs.lakefs.access.key": lakefs_access_key,
            "spark.hadoop.fs.lakefs.secret.key": lakefs_secret_key,
        }
        if args.access_mode == "hadoopfs":
            # These come from optional CLI arguments and may be None; None
            # values are dropped when the command line is built.
            spark_configs["spark.hadoop.fs.s3a.access.key"] = args.aws_access_key
            spark_configs["spark.hadoop.fs.s3a.secret.key"] = args.aws_secret_key
            spark_configs["spark.hadoop.fs.s3a.region"] = args.region
        else:
            spark_configs["spark.hadoop.fs.lakefs.access.mode"] = "presigned"
        return "lakefs", spark_configs

    spark_configs = {
        "spark.hadoop.fs.s3a.access.key": lakefs_access_key,
        "spark.hadoop.fs.s3a.secret.key": lakefs_secret_key,
        "spark.hadoop.fs.s3a.endpoint": "s3.docker.lakefs.io:8000",
        "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    }
    if args.redirect:
        spark_configs["spark.hadoop.fs.s3a.path.style.access"] = "true"
        spark_configs["spark.hadoop.fs.s3a.signing-algorithm"] = "QueryStringSignerType"
        spark_configs["spark.hadoop.fs.s3a.user.agent.prefix"] = "s3RedirectionSupport"
    return "s3a", spark_configs


def main():
    """Parse arguments, prepare the repository, run the job and exit with its code."""
    parser = argparse.ArgumentParser()
    # NOTE: both arguments below are required, so their `default` values are
    # never applied by argparse.
    parser.add_argument("--storage_namespace", default="local://", required=True)
    parser.add_argument("--repository", default="example", required=True)
    parser.add_argument("--sonnet_jar", required=True)
    parser.add_argument("--client_version")
    parser.add_argument("--aws_access_key")
    parser.add_argument("--aws_secret_key")
    parser.add_argument("--redirect", action='store_true')
    parser.add_argument("--access_mode", choices=["s3_gateway", "hadoopfs", "hadoopfs_presigned"], default="s3_gateway")
    parser.add_argument("--region")
    lakefs_access_key = 'AKIAIOSFODNN7EXAMPLE'
    lakefs_secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

    args = parser.parse_args()
    if args.client_version:
        submit_flags = ["--packages", f"io.lakefs:hadoop-lakefs-assembly:{args.client_version}"]
    else:
        submit_flags = ["--jars", "/target/client.jar"]

    lfs_client = LakeFSClient(
        lakefs_client.Configuration(username=lakefs_access_key,
                                    password=lakefs_secret_key,
                                    host='http://localhost:8000'))
    wait_for_setup(lfs_client)
    lfs_client.repositories.create_repository(
        models.RepositoryCreation(name=args.repository,
                                  storage_namespace=args.storage_namespace,
                                  default_branch='main'))

    with open('./app/data-sets/sonnets.txt', 'rb') as f:
        lfs_client.objects.upload_object(repository=args.repository, branch="main", path="sonnets.txt", content=f)

    scheme, spark_configs = _build_spark_configs(args, lakefs_access_key, lakefs_secret_key)
    jar_args = [f"{scheme}://{args.repository}/main/sonnets.txt",
                f"{scheme}://{args.repository}/main/sonnets-wordcount"]

    generator = docker.compose.run("spark-submit",
                                   get_spark_submit_cmd(submit_flags, spark_configs, args.sonnet_jar, jar_args),
                                   dependencies=False,
                                   tty=False,
                                   stream=True,
                                   name="submit")

    try:
        for _, stream_content in generator:
            print(stream_content.decode(), end="")
        state = docker.container.inspect("submit").state
        if state.exit_code != 0:
            print(state.error)
    finally:
        # Remove the named container even when streaming or inspection raises,
        # so it is not left behind under the fixed name "submit".
        docker.container.remove("submit")
    sys.exit(state.exit_code)


if __name__ == '__main__':
    main()