github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/clients/hadoopfs/examples/spark_with_lakefs.py

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("test_app").getOrCreate()
spark._jsc.hadoopConfiguration().set("fs.lakefs.access.key", "AKIAIOSFODNN7EXAMPLE")
spark._jsc.hadoopConfiguration().set("fs.lakefs.secret.key", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
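# The lines above set only credentials. In practice the lakeFS Hadoop
# FileSystem must also be registered and pointed at a lakeFS API endpoint;
# a minimal sketch, assuming a local lakeFS server on port 8000 (the
# endpoint URL and any underlying fs.s3a.* storage credentials are
# deployment-specific):
spark._jsc.hadoopConfiguration().set("fs.lakefs.impl", "io.lakefs.LakeFSFileSystem")
spark._jsc.hadoopConfiguration().set("fs.lakefs.endpoint", "http://localhost:8000/api/v1")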
sc = spark.sparkContext
sc.setLogLevel("DEBUG")
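# Bridge to the JVM's log4j so driver-side Python code can log through it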
log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.LogManager.getLogger(__name__)

# Write a small RDD through the lakeFS file system
samples = sc.parallelize([
    ("abonsanto@fakemail.com", "Alberto", "Bonsanto"),
    ("dbonsanto@fakemail.com", "Dakota", "Bonsanto")
])
samples.saveAsTextFile("lakefs://example1/master/output.txt")
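# Note: saveAsTextFile creates a directory of part files at this path,
# not a single text object.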

# Read a file back through the lakeFS file system and trace the first lines
lines = sc.textFile("lakefs://example1/master/input.txt")
for line in lines.take(10):
    log.trace(line)
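# The same data can also be read through the DataFrame API; a minimal
# sketch using Spark's standard text reader ("value" is its default
# column name):
df = spark.read.text("lakefs://example1/master/input.txt")
df.show(10, truncate=False)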

sc.stop()