github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/deployments/utils/spark3-hadoop2/Dockerfile (about)

     1  # INSTRUCTIONS
     2  #
     3  # Build:
     4  #     docker build --tag treeverse/bitnami-based-spark:3.2.3-with-hadoop-2.7-for-testing .
     5  #
     6  # Push:
     7  #     docker push treeverse/bitnami-based-spark:3.2.3-with-hadoop-2.7-for-testing
     8  #
     9  # Use *only* in Esti.  This is **NOT** a good Docker image to use for
    10  # running Spark, except for testing.
    11  
    12  FROM --platform=$BUILDPLATFORM ubuntu:21.04 AS extract
    13  
    14  WORKDIR /build
    15  # Extract hadoop-aws-2.7.4 and its dependency aws-java-sdk-1.7.4 from an old
    16  # archived version of Apache Hadoop.  These JARs are so long-dead that this
    17  # is the *easy* way of getting our actual test code on them.  One tar pass
    18  # reads the archive once; stripping the 5 leading directories drops both
    19  # JARs into WORKDIR.  See, fear, and never use this image except in tests.
    20  ADD https://archive.apache.org/dist/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz /tmp/hadoop-2.7.4.tar.gz
    21  RUN tar --extract --gzip --no-same-owner --no-same-permissions --strip-components=5 --file /tmp/hadoop-2.7.4.tar.gz \
    22          hadoop-2.7.4/share/hadoop/tools/lib/hadoop-aws-2.7.4.jar hadoop-2.7.4/share/hadoop/tools/lib/aws-java-sdk-1.7.4.jar
    23  
    24  # Build Bitnami Spark 3.2.x but with Hadoop 2.  Details in
    25  # https://github.com/bitnami/bitnami-docker-spark#using-a-different-version-of-hadoop-jars.
    26  #
    27  # The Spark distribution is downloaded and unpacked in a throwaway stage so
    28  # that the tarball never becomes a layer of the final image: removing it in
    29  # a later RUN would not shrink the layer that ADD created.  It is fetched
    30  # from archive.apache.org, which keeps superseded releases, rather than from
    31  # dlcdn.apache.org, which only carries current ones.
    32  FROM --platform=$BUILDPLATFORM ubuntu:21.04 AS spark-jars
    33  
    34  WORKDIR /spark
    35  ADD https://archive.apache.org/dist/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz /tmp/
    36  # Unpack only jars/ from the archive.  --strip-components=1 removes the
    37  # top-level spark-3.2.3-bin-hadoop2.7/ directory, leaving /spark/jars/.
    38  RUN tar --extract --gzip --no-same-owner --strip-components=1 --file /tmp/spark-3.2.3-bin-hadoop2.7.tgz spark-3.2.3-bin-hadoop2.7/jars/
    39  
    40  FROM bitnami/spark:3.2.3
    41  
    42  # Root is needed to replace files under /opt/bitnami/spark.
    43  USER root
    44  # Swap the JARs shipped with the base image for the ones from the Hadoop 2.7
    45  # build of Spark, then add the hadoop-aws and AWS SDK JARs on top.
    46  RUN rm -r /opt/bitnami/spark/jars
    47  COPY --from=spark-jars /spark/jars/ /opt/bitnami/spark/jars/
    48  COPY --from=extract /build/*.jar /opt/bitnami/spark/jars/
    49  # Drop back to the unprivileged UID for runtime.
    50  USER 1001