github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/spark/app/src/main/scala/Sonnets.scala

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.hadoop.fs.{Path, RemoteIterator}
import org.apache.log4j.Logger

import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex

object Sonnets {
  // Make RemoteIterator a Scala iterator. Reformatted version of
  // https://gist.github.com/timvw/4ec727de9b76d9afc51298d9e68c4241.

  /**
   * Converts a Hadoop RemoteIterator to a Scala Iterator, which provides all the familiar
   * functions such as map, filter, foreach, etc.
   *
   * @param underlying The RemoteIterator that needs to be wrapped
   * @tparam T Type of the items inside the iterator
   * @return Standard Scala Iterator
   */
  def convertToScalaIterator[T](underlying: RemoteIterator[T]): Iterator[T] = {
    case class Wrapper(underlying: RemoteIterator[T]) extends Iterator[T] {
      override def hasNext: Boolean = underlying.hasNext

      override def next(): T = underlying.next()
    }
    Wrapper(underlying)
  }

  val logger = Logger.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SonnetsApp")
      .getOrCreate()
    val input = args.applyOrElse(0, (_: Any) => "s3a://example/main/sonnets.txt")
    val output = args.applyOrElse(1, (_: Any) => "s3a://example/main/sonnets-wordcount")
    val sc = spark.sparkContext
    sc.setLogLevel("INFO")
    import spark.implicits._
    val sonnets = sc.textFile(input)
    val partitions = 7
    // Word count, keyed for partitioning by each word's first letter.
    val counts = sonnets
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .map({ case (word, count) => (word, count, if (word != "") word.substring(0, 1) else "") })
      .toDF("word", "count", "partition_key")
      .repartition(partitions, $"partition_key")

    val failed = ListBuffer.empty[String]

    for (fmt <- Seq("csv", "parquet", "json", "orc")) {
      try {
        // Save the data in the selected format.
        val outputPath = s"${output}.${fmt}"
        logger.info(s"Write word count - format:$fmt path:$outputPath")
        val targetBase = counts.write.partitionBy("partition_key").mode(SaveMode.Overwrite).format(fmt)
        val target = (
          if (fmt == "csv")
            targetBase.option("inferSchema", "true").option("header", "true")
          else targetBase
        )
        target.save(outputPath)

        {
          /*
           * Verify all files match one of:
           * - s"${outputPath}/partition_key=PPP/XXXYYYZZZ.${fmt}"
           * - s"${outputPath}/_SUCCESS"
           */
          val pattern: Regex = s"${Regex.quote(outputPath)}/(partition_key=[^/]*/[^/]*\\.${Regex.quote(fmt)}|_SUCCESS)".r
          val path = new Path(outputPath)
          val fs = path.getFileSystem(sc.hadoopConfiguration)
          val badFiles = convertToScalaIterator(fs.listFiles(path, true))
            .map(file => file.getPath.toString)
            .filter(name => !name.matches(pattern.toString))
            .toList
          if (badFiles.nonEmpty) {
            logger.error(s"Unexpected leftover files generating ${outputPath}: ${badFiles.mkString(", ")}")
            failed += fmt
          }
        }

        // Read back the data we just wrote.
        logger.info(s"Read word count - format:$fmt path:$outputPath")
        val sourceBase = spark.read.format(fmt)
        val source = (
          if (fmt == "csv")
            sourceBase.option("inferSchema", "true").option("header", "true")
          else sourceBase
        )
        val data = source.load(outputPath)

        // Filter the word count for a specific 'times' value and compare with the expected result.
        val expected = "can,or"
        val times = "42"
        val life = data
          .filter($"count".cast("string") === times) // different formats infer different types - compare as strings
          .map(_.getAs[String]("word"))
          .collect
          .sorted
          .mkString(",")
        if (life != expected) {
          logger.error(s"Word count - format:$fmt times:$times matched:'$life' expected:'$expected'")
          println(s"Words found $times times, '$life', doesn't match '$expected' (format:$fmt)")
          failed += fmt
        }
      } catch {
        case e: Exception =>
          logger.error(s"Format $fmt unexpected exception: ${e}")
          failed += fmt
      }
    }

    if (failed.nonEmpty) {
      logger.error(s"FAIL formats: ${failed}\n")
      System.exit(1)
    }
  }
}
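
For reference, a minimal usage sketch (not part of this file) of how convertToScalaIterator adapts a Hadoop RemoteIterator, such as the one returned by FileSystem.listFiles, to Scala's standard map/filter/foreach combinators. The object name ListExample and the path /tmp/data are illustrative assumptions.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object ListExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical path; any Hadoop-compatible URI works here.
    val path = new Path("/tmp/data")
    val fs = path.getFileSystem(new Configuration())
    // fs.listFiles returns a Hadoop RemoteIterator[LocatedFileStatus]; wrapping it
    // exposes the familiar Scala Iterator API over the remote listing.
    Sonnets.convertToScalaIterator(fs.listFiles(path, true))
      .map(_.getPath.toString)
      .foreach(println)
  }
}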