github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/test/spark/s3a-multipart/src/main/scala/S3AMultipart.scala (about)

     1  package io.lakefs.tests
     2  
     3  import java.net.URI
     4  
     5  import org.apache.hadoop.fs
     6  import org.apache.hadoop.conf.Configuration
     7  import com.amazonaws.SDKGlobalConfiguration
     8  
     9  import scala.util.{Try, Success, Failure}
    10  
    11  object S3AMultipart extends App {
    12    def newRandom() = new scala.util.Random(17)
    13  
    14    override def main(args: Array[String]) {
    15      val partSize = 5 << 20 // Must be >= 5MiB on AWS S3.
    16      val fileSize = 2 * partSize
    17      val writeSize = 1 << 18
    18  
    19      if (args.length != 1) {
    20        Console.err.println("Usage: ... s3://bucket/path/to/object")
    21        System.exit(1)
    22      }
    23      val path = args(0)
    24  
    25      System.setProperty(SDKGlobalConfiguration.ENABLE_S3_SIGV4_SYSTEM_PROPERTY, "true")
    26  
    27      val conf = new Configuration(true)
    28      conf.set("fs.s3a.access.key", System.getenv("AWS_ACCESS_KEY_ID"))
    29      conf.set("fs.s3a.secret.key", System.getenv("AWS_SECRET_ACCESS_KEY"))
    30      conf.set("fs.s3a.custom.signers", "AWS4SignerType")
    31      conf.set("fs.s3a.multipart.size", s"${5 << 20}")
    32      conf.set("fs.s3a.multipart.threshold", s"${5 << 20}")
    33  
    34      val region = System.getenv("AWS_REGION")
    35      if (region != null) {
    36        conf.set("fs.s3a.region", region)
    37        conf.set("fs.s3a.endpoint",
    38                 s"s3.${region}.amazonaws.com"
    39                ) // Otherwise it tries host-based addressing and fails
    40      }
    41  
    42      val endpoint = System.getenv("ENDPOINT")
    43      if (endpoint != null) {
    44        conf.set("fs.s3a.endpoint", endpoint)
    45      }
    46  
    47      val uri = try {
    48        new URI(args(0))
    49      } catch {
    50        case e: (Any) => {
    51          Console.err.printf("parse URI %s: %s\n", args(0), e)
    52          System.exit(1)
    53          null
    54        }
    55      }
    56  
    57      val filesystem = fs.FileSystem.get(uri, conf)
    58  
    59      def asBytes(r: scala.util.Random, size: Int): Iterator[Array[Byte]] = {
    60        def getNext() = {
    61          val bytes = new Array[Byte](size)
    62          r.nextBytes(bytes)
    63          bytes
    64        }
    65        Iterator.continually(getNext())
    66      }
    67  
    68      val up = filesystem.create(new fs.Path(path))
    69  
    70      val upRand = newRandom()
    71      asBytes(upRand, writeSize).take(fileSize / writeSize).foreach(b => up.write(b, 0, b.length))
    72      up.close()
    73  
    74      val down = filesystem.open(new fs.Path(path))
    75      val actualBytes = Iterator.continually(down.read()).takeWhile(_ >= 0).map(_.toByte)
    76  
    77      val expectedBytes = asBytes(newRandom(), writeSize).flatten
    78  
    79      val diffs = (actualBytes zip expectedBytes).
    80        zipWithIndex.
    81        filter({ case (((a, b), i)) => a != b }).
    82        take(10).
    83        toList
    84  
    85      if (! diffs.isEmpty) {
    86        val diffsText = diffs.map({ case ((a, b), i) => s"${a} != ${b} @${i}" })
    87        Console.err.println(s"Downloaded other bytes than uploaded; first diffs ${diffsText}")
    88        System.exit(1)
    89      }
    90    }
    91  }