From 3c327534275bc00297710425db53c81a5f46c2a7 Mon Sep 17 00:00:00 2001
From: Fred
Date: Fri, 23 Jun 2017 15:33:55 +0800
Subject: [PATCH] Base the image from debian and install python instead to use
 a newer version of openssl (25 May 2017)

Added python3/spark2.1 for more consistency.
Modified the Java path in python3/* to point to Java 7 (previously wrongly
pointing to Java 8)
---
 python2/spark1.6/Dockerfile |   5 +-
 python2/spark2.1/Dockerfile |   5 +-
 python3/spark1.6/Dockerfile |   7 ++-
 python3/spark2.1/Dockerfile | 100 ++++++++++++++++++++++++++++++++++++
 4 files changed, 113 insertions(+), 4 deletions(-)
 create mode 100644 python3/spark2.1/Dockerfile

NOTE(review): line breaks were reconstructed from a whitespace-mangled copy of
this patch; indentation of continuation lines is best-effort and should be
confirmed against the original files. If context lines fail to apply, use
`git am --ignore-whitespace` (or `git apply --ignore-whitespace`). This note
sits before the first "diff --git" line and is ignored by git am.

diff --git a/python2/spark1.6/Dockerfile b/python2/spark1.6/Dockerfile
index 861201b..eb21b5f 100644
--- a/python2/spark1.6/Dockerfile
+++ b/python2/spark1.6/Dockerfile
@@ -1,4 +1,7 @@
-FROM python:2.7
+FROM debian
+
+# Install Python
+RUN apt-get update && apt-get install -y python wget curl gnupg && apt-get clean && curl https://bootstrap.pypa.io/get-pip.py | python
 
 # Setup Java
 RUN set -x && \
diff --git a/python2/spark2.1/Dockerfile b/python2/spark2.1/Dockerfile
index 94039e9..8c031dc 100644
--- a/python2/spark2.1/Dockerfile
+++ b/python2/spark2.1/Dockerfile
@@ -1,4 +1,7 @@
-FROM python:2.7
+FROM debian
+
+# Install Python
+RUN apt-get update && apt-get install -y python wget curl gnupg && apt-get clean && curl https://bootstrap.pypa.io/get-pip.py | python
 
 # Setup Java
 RUN set -x && \
diff --git a/python3/spark1.6/Dockerfile b/python3/spark1.6/Dockerfile
index ed30b69..11f34bb 100644
--- a/python3/spark1.6/Dockerfile
+++ b/python3/spark1.6/Dockerfile
@@ -1,4 +1,7 @@
-FROM python:3.5
+FROM debian
+
+# Install Python
+RUN apt-get update && apt-get install -y python3 wget curl gnupg && apt-get clean && ln -s $(which python3) /usr/bin/python && curl https://bootstrap.pypa.io/get-pip.py | python
 
 # Setup Java
 RUN set -x && \
@@ -17,7 +20,7 @@ RUN set -x && \
     apt-get remove software-properties-common -y --auto-remove && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
+ENV JAVA_HOME /usr/lib/jvm/java-7-oracle
 
 ARG HADOOP_VERSION=2.6.1
 ARG SPARK_VERSION=1.6.1
diff --git a/python3/spark2.1/Dockerfile b/python3/spark2.1/Dockerfile
new file mode 100644
index 0000000..956dba4
--- /dev/null
+++ b/python3/spark2.1/Dockerfile
@@ -0,0 +1,100 @@
+FROM debian
+
+# Install Python
+RUN apt-get update && apt-get install -y python3 wget curl gnupg && apt-get clean && ln -s $(which python3) /usr/bin/python && curl https://bootstrap.pypa.io/get-pip.py | python
+
+# Setup Java
+RUN set -x && \
+    apt-get update && \
+    apt-get install --no-install-recommends -y software-properties-common && \
+    echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" > \
+    /etc/apt/sources.list.d/webupd8team-java.list && \
+    echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" >> \
+    /etc/apt/sources.list.d/webupd8team-java.list && \
+    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886 && \
+    wget -q -P /tmp http://ftp.osuosl.org/pub/funtoo/distfiles/oracle-java/jdk-7u80-linux-x64.tar.gz && \
+    echo oracle-java7-installer oracle-java7-installer/local select /tmp | /usr/bin/debconf-set-selections && \
+    echo oracle-java7-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
+    apt-get update && echo yes | apt-get install -y --force-yes oracle-java7-installer && \
+    apt-get update && apt-get install oracle-java7-set-default && \
+    apt-get remove software-properties-common -y --auto-remove && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+ENV JAVA_HOME /usr/lib/jvm/java-7-oracle
+
+ARG HADOOP_VERSION=2.7.3
+ARG SPARK_VERSION=2.1.0
+
+# Setup Hadoop variables
+ENV HADOOP_HOME /opt/hadoop
+ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
+ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
+ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
+ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
+ENV YARN_HOME ${HADOOP_HOME}
+ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
+ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
+ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+
+# Setup Hive
+ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}
+
+# Setup Spark
+ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
+ENV PYSPARK_PYTHON=python
+ENV PATH=$PATH:${SPARK_HOME}/bin
+
+# Set Python Spark 2 specific settings
+ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.11:1.5.0,com.databricks:spark-avro_2.11:3.1.0,graphframes:graphframes:0.5.0-spark2.0-s_2.11 pyspark-shell"
+ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.3-src.zip
+
+# Exposes the relevant ports and setup the port settings
+ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
+ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
+ENV SPARK_MASTER_PORT 7077
+ENV SPARK_MASTER_WEBUI_PORT 8080
+ENV SPARK_WORKER_PORT 8888
+ENV SPARK_WORKER_WEBUI_PORT 8081
+
+# Set up Sqoop
+ENV SQOOP_HOME /opt/sqoop
+ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin
+
+# Download binaries
+RUN /bin/bash -c 'set -x && \
+    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
+    wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
+    tar -xz -C /opt/ && \
+    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
+    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
+    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
+    tar -xz -C /opt/ && \
+    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
+    echo "Downloading Spark packages" && \
+    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/3.1.0/spark-avro_2.11-3.1.0.jar -P ${SPARK_HOME}/jars && \
+    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.11/1.5.0/spark-csv_2.11-1.5.0.jar -P ${SPARK_HOME}/jars && \
+    echo "Downloading Sqoop" && \
+    wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
+    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
+    echo "Downloading the JDBC drivers for Postgresql" && \
+    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
+    echo "Downloading the JDBC drivers for MySQL" && \
+    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
+    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
+    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
+    echo "Downloading the JDBC drivers for MS SQL" && \
+    wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \
+    tar xz -C /tmp && \
+    mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
+    rm -r /tmp/sqljdbc_4.2 && \
+    echo "Cleaning up" && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*'
+
+EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
+
+# Install kerberos client support
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y krb5-user
+
+CMD '/bin/bash'