package io.lakefs.fs

import io.lakefs.clients.api.RepositoriesApi
import io.lakefs.clients.api.auth.HttpBasicAuth
import io.lakefs.clients.api.model.RepositoryCreation
import io.lakefs.clients.api.ApiException
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.spark.sql.{RuntimeConfig, SparkSession}
import org.slf4j.LoggerFactory

/**
 * Example Spark job exercising the lakeFS Hadoop FileSystem.
 *
 * It creates a lakeFS repository via the lakeFS API, aggregates review counts
 * from the Amazon Customer Reviews parquet dataset, writes the result to a
 * `lakefs://` path partitioned by year, then reads it back and logs the top
 * ten products of 2015.
 */
object LakeFSFS {
  private val LOG = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LakeFSFS")
      .getOrCreate()

    // Command-line flags; each has a default so the job can run unattended.
    val cliOptions = new Options()
      .addOption("r", "repository", true, "Repository name")
      .addOption("b", "branch", true, "Branch name")
      .addOption("s", "storage-namespace", true, "Storage namespace")
      .addOption("a", "amazon-reviews", true, "Amazon Customer Reviews dataset location")
    val cli = new BasicParser().parse(cliOptions, args)
    val repository = cli.getOptionValue("r", "example")
    val branch = cli.getOptionValue("b", "main")
    val storageNamespace = cli.getOptionValue("s", s"s3://$repository")
    val sourcePath = cli.getOptionValue("a", "s3a://amazon-reviews-pds/parquet")

    spark.sparkContext.setLogLevel("INFO")

    // use lakefs api client to create repository for this app
    val apiClient = makeApiClient(spark.conf)

    LOG.info("Create repository={}, branch={}, storageNamespace={}", repository, branch, storageNamespace)
    val repositoriesApi = new RepositoriesApi(apiClient)
    try {
      val creation = new RepositoryCreation()
        .name(repository)
        .defaultBranch(branch)
        .storageNamespace(storageNamespace)
      repositoriesApi.createRepository(creation, false)
      LOG.info("Repository created repository={}, branch={}, storageNamespace={}", repository, branch, storageNamespace)
    } catch {
      // Best-effort: creation can fail (e.g. the repository already exists);
      // log the API error and carry on with the rest of the job.
      case e: ApiException => LOG.error("Create repository failed (code " + e.getCode() + ")", e)
    }

    LOG.info("Read data from {}", sourcePath)
    spark.read
      .parquet(sourcePath)
      .createOrReplaceTempView("pds")

    // select product id and title with number of reviews from 'Books' category and marketplace 'US'
    val reviewCounts = spark.sql(
      "SELECT product_id, product_title, year, COUNT(*) as num_reviews " +
        "FROM pds where product_category='Books' AND marketplace='US' " +
        "GROUP BY product_id, product_title, year")

    // write to lakeFS partitioned by 'year'
    val byYearPath = s"lakefs://$repository/$branch/amazon-reviews-pds/parquet"
    LOG.info("Write products by year to {}", byYearPath)
    reviewCounts.write
      .partitionBy("year")
      .parquet(byYearPath)

    // read the partitioned data back through lakeFS
    LOG.info("Read by year data and compute top 10 for 2015")
    spark.read
      .parquet(byYearPath)
      .createOrReplaceTempView("lfs_pds")
    val topTen = spark.sql(
      "SELECT product_id, product_title, num_reviews " +
        "FROM lfs_pds where year=2015 " +
        "ORDER BY num_reviews DESC " +
        "LIMIT 10")
    topTen.collect.foreach(row => LOG.info("PRODUCT {}", row))

    spark.stop()
  }

  /**
   * Builds a lakeFS API client from the lakeFS Hadoop FS settings found in
   * the Spark runtime configuration (`spark.hadoop.fs.lakefs.*`), wiring the
   * access/secret key pair into the client's HTTP basic-auth scheme.
   *
   * @param conf Spark runtime configuration carrying the lakeFS FS settings
   * @return a configured lakeFS `ApiClient`
   */
  private def makeApiClient(conf: RuntimeConfig) = {
    // use lakefs hadoop fs configuration parameters
    LOG.info("Setup lakeFS API client")
    val accessKey = conf.get("spark.hadoop.fs.lakefs.access.key")
    val secretKey = conf.get("spark.hadoop.fs.lakefs.secret.key")
    // strip a trailing slash so path joining inside the client stays clean
    val endpoint = conf.get("spark.hadoop.fs.lakefs.endpoint").stripSuffix("/")
    LOG.info("lakeFS API client endpoint={}", endpoint)

    val client = io.lakefs.clients.api.Configuration.getDefaultApiClient
    client.setBasePath(endpoint)

    val basicAuth = client.getAuthentication("basic_auth").asInstanceOf[HttpBasicAuth]
    basicAuth.setUsername(accessKey)
    basicAuth.setPassword(secretKey)

    client
  }
}