github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/spark/app/src/main/scala/Sonnets.scala

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.hadoop.fs.{Path, RemoteIterator}
import org.apache.log4j.Logger

import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
object Sonnets {
  // Make RemoteIterator a Scala iterator.  Reformatted version of
  // https://gist.github.com/timvw/4ec727de9b76d9afc51298d9e68c4241.
  /**
   * Converts a Hadoop RemoteIterator into a Scala Iterator, which provides all the familiar
   * collection functions such as map, filter, foreach, etc.
   *
   * @param underlying The RemoteIterator that needs to be wrapped
   * @tparam T Type of the items inside the iterator
   * @return A standard Scala Iterator
   */
  def convertToScalaIterator[T](underlying: RemoteIterator[T]): Iterator[T] = {
    case class Wrapper(underlying: RemoteIterator[T]) extends Iterator[T] {
      override def hasNext = underlying.hasNext

      override def next = underlying.next
    }
    Wrapper(underlying)
  }
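  // Example usage (a minimal sketch; `fs` is a Hadoop FileSystem and `path` a Path
  // obtained elsewhere): wrapping fs.listFiles lets us use the usual Scala
  // collection methods on the Hadoop listing, as main() does below.
  //   convertToScalaIterator(fs.listFiles(path, true)).map(_.getPath.toString)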

  val logger = Logger.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SonnetsApp")
      .getOrCreate()
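    // Input and output paths: fall back to the example defaults when no
    // command-line arguments are given.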
    val input = args.applyOrElse(0, (_: Any) => "s3a://example/main/sonnets.txt")
    val output = args.applyOrElse(1, (_: Any) => "s3a://example/main/sonnets-wordcount")
    val sc = spark.sparkContext
    sc.setLogLevel("INFO")
    import spark.implicits._
    val sonnets = sc.textFile(input)
    val partitions = 7
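    // Word count: split each line on spaces, count occurrences of every word, and
    // key each row by the word's first letter so the output can be partitioned.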
    val counts = sonnets
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .map({ case (word, count) => (word, count, if (word != "") word.substring(0, 1) else "") })
      .toDF("word", "count", "partition_key")
      .repartition(partitions, $"partition_key")

    var failed = new ListBuffer[String]

    for (fmt <- Seq("csv", "parquet", "json", "orc")) {
      try {
        // save the data in the selected format
        val outputPath = s"${output}.${fmt}"
        logger.info(s"Write word count - format:$fmt path:$outputPath")
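        // CSV needs an explicit header so the read-back below recovers the column
        // names; the other formats are self-describing. (inferSchema is a read-side
        // option and has no effect on the writer.)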
        val targetBase = counts.write.partitionBy("partition_key").mode(SaveMode.Overwrite).format(fmt)
        val target = (
          if (fmt == "csv")
            targetBase.option("inferSchema", "true").option("header", "true")
          else targetBase
        )
        target.save(outputPath)

        {
          /*
           *  Verify all files match one of:
           *  - s"${outputPath}/partition_key=PPP/XXXYYYZZZ.${fmt}"
           *  - s"${outputPath}/_SUCCESS"
           */
          val pattern: Regex = s"${Regex.quote(outputPath)}/(partition_key=[^/]*/[^/]*\\.${Regex.quote(fmt)}|_SUCCESS)".r
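          // Spark typically names its output parts along the lines of
          // partition_key=X/part-00000-<uuid>.<fmt>, which the pattern above accepts.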
          val path = new Path(outputPath)
          val fs = path.getFileSystem(sc.hadoopConfiguration)
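          // Recursively list everything under the output path and flag any file
          // that the pattern does not allow.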
          val badFiles = convertToScalaIterator(fs.listFiles(path, true)).
            map(file => file.getPath.toString).
            filter(name => !(name matches pattern.toString)).
            toList
          if (badFiles.nonEmpty) {
            logger.error(s"Unexpected leftover files while generating ${outputPath}: ${badFiles.mkString(", ")}")
            failed = failed :+ fmt
          }
        }

        // read the data we just wrote
        logger.info(s"Read word count - format:$fmt path:$outputPath")
        val sourceBase = spark.read.format(fmt)
        val source = (
          if (fmt == "csv")
            sourceBase.option("inferSchema", "true").option("header", "true")
          else sourceBase
        )
        val data = source.load(outputPath)

        // filter the word count for a specific 'times' value and compare against the expected words
        val expected = "can,or"
        val times = "42"
        val life = data
          .filter($"count".cast("string") === times) // different formats use different types - compare as strings
          .map(_.getAs[String]("word"))
          .collect
          .sorted
          .mkString(",")
        if (life != expected) {
          logger.error(s"Word count - format:$fmt times:$times matched:'$life' expected:'$expected'")
          println(s"Words found $times times, '$life', doesn't match '$expected' (format:$fmt)")
          failed = failed :+ fmt
        }
      } catch {
        case e: Exception =>
          logger.error(s"Format $fmt unexpected exception: ${e}")
          failed = failed :+ fmt
      }
    }

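    // Exit with a non-zero status if any format failed, so callers can detect the failure.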
    if (failed.nonEmpty) {
      logger.error(s"FAIL  formats: ${failed}\n")
      System.exit(1)
    }
  }
}