github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/clients/hadoopfs/examples/spark_with_lakefs.py

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("test_app").getOrCreate()

# Configure credentials for the lakeFS Hadoop FileSystem
spark._jsc.hadoopConfiguration().set("fs.lakefs.access.key", "AKIAIOSFODNN7EXAMPLE")
spark._jsc.hadoopConfiguration().set("fs.lakefs.secret.key", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")

sc = spark.sparkContext
sc.setLogLevel("DEBUG")
log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.LogManager.getLogger(__name__)

# Write using the lakeFS file system.
# saveAsTextFile creates a directory of part files under the given path.
samples = sc.parallelize([
    ("abonsanto@fakemail.com", "Alberto", "Bonsanto"),
    ("dbonsanto@fakemail.com", "Dakota", "Bonsanto"),
])
samples.saveAsTextFile("lakefs://example1/master/output.txt")

# Read a file using the lakeFS file system.
# take() returns a plain Python list, so iterate over it locally.
lines = sc.textFile("lakefs://example1/master/input.txt")
for line in lines.take(10):
    log.trace(line)

sc.stop()
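
# --- Possible additional configuration (not part of the original example) ---
# A minimal sketch of settings the lakeFS Hadoop FileSystem typically also
# needs: the FileSystem implementation mapping, the lakeFS API endpoint, and
# S3A credentials for the underlying object store. The endpoint URL and the
# <...> values below are placeholders; adjust them for your deployment.
#
# conf = spark._jsc.hadoopConfiguration()
# conf.set("fs.lakefs.impl", "io.lakefs.LakeFSFileSystem")
# conf.set("fs.lakefs.endpoint", "https://lakefs.example.com/api/v1")  # placeholder URL
# conf.set("fs.s3a.access.key", "<s3-access-key>")  # placeholder value
# conf.set("fs.s3a.secret.key", "<s3-secret-key>")  # placeholder value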