github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/lakefsfs/src/main/scala/io/lakefs/fs/LakeFSFS.scala

package io.lakefs.fs

import io.lakefs.clients.api.{ApiException, RepositoriesApi}
import io.lakefs.clients.api.auth.HttpBasicAuth
import io.lakefs.clients.api.model.RepositoryCreation
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.spark.sql.{RuntimeConfig, SparkSession}
import org.slf4j.LoggerFactory

object LakeFSFS {
  private val LOG = LoggerFactory.getLogger(getClass)

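  // End-to-end flow: create a lakeFS repository through the API client, read
  // the Amazon Customer Reviews dataset, count US book reviews per product and
  // year, write the result to lakeFS partitioned by year, then read it back
  // and log the top 10 titles for 2015.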
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LakeFSFS")
      .getOrCreate()

    val options = new Options()
      .addOption("r", "repository", true, "Repository name")
      .addOption("b", "branch", true, "Branch name")
      .addOption("s", "storage-namespace", true, "Storage namespace")
      .addOption("a", "amazon-reviews", true, "Amazon Customer Reviews dataset location")
    val parser = new BasicParser
    val cmd = parser.parse(options, args)
    val repository = cmd.getOptionValue("r", "example")
    val branch = cmd.getOptionValue("b", "main")
    val storageNamespace = cmd.getOptionValue("s", s"s3://$repository")
    val sourcePath = cmd.getOptionValue("a", "s3a://amazon-reviews-pds/parquet")

    val sc = spark.sparkContext
    sc.setLogLevel("INFO")

    // use the lakeFS API client to create the repository for this app
    val apiClient = makeApiClient(spark.conf)

    LOG.info("Create repository={}, branch={}, storageNamespace={}", repository, branch, storageNamespace)
    val repositoriesApi = new RepositoriesApi(apiClient)
    try {
      val repositoryCreation = new RepositoryCreation()
        .name(repository)
        .defaultBranch(branch)
        .storageNamespace(storageNamespace)
      repositoriesApi.createRepository(repositoryCreation, false)
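      // the second argument above appears to be the API's 'bare' flag; passing
      // false creates the repository together with its default branch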
    46        LOG.info("Repository created repository={}, branch={}, storageNamespace={}", repository, branch, storageNamespace)
    47      } catch {
    48        case e: ApiException => LOG.error("Create repository failed (code " + e.getCode() + ")", e)
    49      }
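    // Note: a failure here is logged but not fatal, so re-running the job
    // against an existing repository proceeds past this point; any real
    // misconfiguration will surface when the lakefs:// paths below are used.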

    LOG.info("Read data from {}", sourcePath)
    val pds = spark.read
      .parquet(sourcePath)
    pds.createOrReplaceTempView("pds")

    // select product id and title with the number of reviews from the 'Books' category in the 'US' marketplace
    val productsReviewCount = spark.sql("SELECT product_id, product_title, year, COUNT(*) AS num_reviews " +
      "FROM pds WHERE product_category='Books' AND marketplace='US' " +
      "GROUP BY product_id, product_title, year")

    // write to lakeFS, partitioned by 'year'
    val byYearPath = s"lakefs://$repository/$branch/amazon-reviews-pds/parquet"
    LOG.info("Write products by year to {}", byYearPath)
    productsReviewCount.write
      .partitionBy("year")
      .parquet(byYearPath)
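    // The lakefs:// scheme above is expected to be served by the lakeFS Hadoop
    // FileSystem, typically wired up with
    // spark.hadoop.fs.lakefs.impl=io.lakefs.LakeFSFileSystem alongside the
    // fs.lakefs.* credentials read in makeApiClient below.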

    // read the data back from lakeFS
    LOG.info("Read by-year data and compute the top 10 for 2015")
    val lfsPds = spark.read.parquet(byYearPath)
    lfsPds.createOrReplaceTempView("lfs_pds")
    val topTen = spark.sql("SELECT product_id, product_title, num_reviews " +
      "FROM lfs_pds WHERE year=2015 " +
      "ORDER BY num_reviews DESC " +
      "LIMIT 10")
    topTen.collect().foreach(entry => LOG.info("PRODUCT {}", entry))

    spark.stop()
  }

  private def makeApiClient(conf: RuntimeConfig) = {
    // use the lakeFS Hadoop FS configuration parameters
    LOG.info("Set up lakeFS API client")
    val accessKey = conf.get("spark.hadoop.fs.lakefs.access.key")
    val secretKey = conf.get("spark.hadoop.fs.lakefs.secret.key")
    val endpoint = conf.get("spark.hadoop.fs.lakefs.endpoint").stripSuffix("/")
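    // endpoint is expected to include the lakeFS API base path, e.g.
    // http://lakefs:8000/api/v1 (a trailing slash is stripped above)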
    89      LOG.info("lakeFS API client endpoint={}", endpoint)
    90      val apiClient = io.lakefs.clients.api.Configuration.getDefaultApiClient
    91      apiClient.setBasePath(endpoint)
    92  
    93      val basicAuth = apiClient.getAuthentication("basic_auth").asInstanceOf[HttpBasicAuth]
    94      basicAuth.setUsername(accessKey)
    95      basicAuth.setPassword(secretKey)
    96  
    97      apiClient
    98    }
    99  }
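
// A minimal spark-submit sketch for running this job; the endpoint, keys,
// bucket, and jar name are placeholders:
//
//   spark-submit --class io.lakefs.fs.LakeFSFS \
//     --conf spark.hadoop.fs.lakefs.impl=io.lakefs.LakeFSFileSystem \
//     --conf spark.hadoop.fs.lakefs.endpoint=http://lakefs:8000/api/v1 \
//     --conf spark.hadoop.fs.lakefs.access.key=<access-key> \
//     --conf spark.hadoop.fs.lakefs.secret.key=<secret-key> \
//     lakefsfs.jar -r example -b main -s s3://example-bucket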