# syntax=docker/dockerfile:1
# Ubuntu 18.04 pulled through hub.laki.cc, a China-local registry mirror
# of Docker Hub (faster pulls from mainland China).
FROM hub.laki.cc/ubuntu:18.04

# Versions of the stack installed below; bump these to upgrade.
# NOTE(review): PYTHON_VERSION is never referenced later (apt installs
# python3.7 directly) — kept for compatibility, confirm before removing.
ENV SPARK_VERSION=2.4.3 \
    HADOOP_VERSION=2.7.3 \
    PYTHON_VERSION=3.7.9
# Install JDK 8 (required by Spark/Hadoop 2.x), Python 3.7, and download
# tools. DEBIAN_FRONTEND is set inline so it does not leak into the runtime
# environment; --no-install-recommends plus the apt-list cleanup in the
# same layer keep the image small. Packages are sorted for diffability.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    curl \
    openjdk-8-jdk \
    python3-pip \
    python3-setuptools \
    python3.7 \
    python3.7-dev \
    wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Point JAVA_HOME at the JDK 8 install (path created by the openjdk-8-jdk
# package above) and put its binaries on PATH. These must stay as two
# separate ENV instructions: $JAVA_HOME would not yet be defined if PATH
# were set in the same instruction.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH
# Install Hadoop: download over HTTPS (the original http:// URL was open
# to in-transit tampering), unpack to /usr/local/hadoop, and delete the
# archive in the same layer so it never bloats the image.
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
    && tar -xzf hadoop-${HADOOP_VERSION}.tar.gz \
    && mv hadoop-${HADOOP_VERSION} /usr/local/hadoop \
    && rm hadoop-${HADOOP_VERSION}.tar.gz
# SPARK_HOME must be defined before the RUN below, which relocates the
# unpacked distribution to this path.
ENV SPARK_HOME=/usr/local/spark

# Install Spark (pre-built for Hadoop 2.7): fetch the archive, extract it
# into /opt, move it to $SPARK_HOME, and remove the tarball in the same
# layer to keep the image lean.
RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop2.7.tgz -C /opt \
    && mv /opt/spark-${SPARK_VERSION}-bin-hadoop2.7 ${SPARK_HOME} \
    && rm spark-${SPARK_VERSION}-bin-hadoop2.7.tgz
# Export HADOOP_HOME and prepend the Spark and Hadoop bin directories to
# PATH so spark-submit, spark-shell, hadoop, hdfs, etc. resolve directly.
ENV HADOOP_HOME=/usr/local/hadoop
ENV PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH
# Install the PySpark client, pinned to the same version as the Spark
# distribution above — an unpinned install would pull a newer, mismatched
# pyspark that can break job submission against this Spark.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir pyspark==${SPARK_VERSION}
# Document the ports this image typically serves (EXPOSE does not publish
# them): 4040 = Spark application UI, 8080/8081 = standalone master/worker
# UIs, 8888 = Jupyter.
EXPOSE 4040 8888 8080 8081

# Working directory for user code mounted or copied into the container.
WORKDIR /workspace

# Default command; override at `docker run` for other entry points.
# The note that was fused onto this line broke exec-form parsing (the
# argument no longer parsed as a JSON array, so Docker fell back to shell
# form and would try to execute the literal string). Moved here instead:
# Note: hub.laki.cc (used in FROM) is a China-local registry mirror.
CMD ["bash"]