72 changes: 43 additions & 29 deletions Docker/Dockerfile
@@ -1,11 +1,14 @@
# syntax=docker/dockerfile:1

###########################################
# Stage 1: Build Python 3.11.6 from source
###########################################
FROM ubuntu:22.04 AS python-build

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.11.6
ENV PREFIX=/usr/local

RUN apt-get update && apt-get install -y \
build-essential \
wget \
@@ -19,33 +22,40 @@ RUN apt-get update && apt-get install -y \
libsqlite3-dev \
libbz2-dev \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /usr/src

RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar -xzf Python-${PYTHON_VERSION}.tgz

WORKDIR /usr/src/Python-${PYTHON_VERSION}

RUN ./configure --enable-optimizations --prefix=${PREFIX} \
&& make -j"$(nproc)" \
&& make altinstall

RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
&& ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip

###########################################
# Stage 2: Get entrypoint from official Spark
###########################################
FROM apache/spark:3.5.6 AS spark-official
FROM apache/spark:3.5.7 AS spark-official

###########################################
# Stage 3: Spark + Delta + Cloud connectors
###########################################
FROM ubuntu:22.04 AS spark-base
ARG SPARK_VERSION=3.5.6

ARG SPARK_VERSION=3.5.7
ARG HADOOP_VERSION=3
ARG DELTA_VERSION=3.2.1

ENV DEBIAN_FRONTEND=noninteractive
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
ENV PATH="${SPARK_HOME}/bin:${PATH}"

# Install Java + basic utilities
# Java + utils
RUN apt-get update && apt-get install -y \
openjdk-11-jdk \
curl \
@@ -56,10 +66,10 @@ RUN apt-get update && apt-get install -y \
procps \
&& rm -rf /var/lib/apt/lists/*

# Copy compiled Python
# Copy Python from build stage
COPY --from=python-build /usr/local /usr/local

# Copy entrypoint script from official Spark image
# Copy entrypoint scripts from official Spark image
COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
COPY --from=spark-official /opt/decom.sh /opt/decom.sh
RUN chmod +x /opt/entrypoint.sh /opt/decom.sh
@@ -71,8 +81,8 @@ RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VER
&& mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Add useful connectors (Delta, AWS, Azure, MySQL)
WORKDIR $SPARK_HOME/jars
# Add connectors (Delta, AWS, Azure, MySQL)
WORKDIR ${SPARK_HOME}/jars
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
@@ -86,35 +96,39 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar

###########################################
# Stage 4: Final runtime image for K8s
# Stage 4: Final runtime image for K8s + Jupyter
###########################################
FROM spark-base AS final

# Set environment variables for PySpark
ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
ENV PYTHONPATH=""
ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
# Non-root user with home dir
RUN groupadd -r -g 185 spark && \
useradd -m -r -u 185 -g 185 -d /home/spark spark

# Install matching PySpark version and dependencies
# Env for Jupyter + PySpark
ENV HOME=/home/spark \
JUPYTER_PORT=8888 \
JUPYTER_DIR=/opt/spark/work-dir/notebooks \
PYSPARK_PYTHON=/usr/local/bin/python3.11 \
PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11 \
PYTHONPATH="${SPARK_HOME}/python"

# PySpark + JupyterLab + libs
RUN pip install --no-cache-dir \
pyspark==3.5.6 \
pyspark==3.5.7 \
pandas \
numpy

# Create non-root user for running Spark (matches official image)
RUN groupadd -r -g 185 spark && \
useradd -r -u 185 -g 185 spark
numpy \
jupyterlab==4.2.5

# Create directory for Spark logs & local storage
RUN mkdir -p /opt/spark/work-dir && \
chown -R spark:spark /opt/spark
# Directories for Jupyter runtime, config, and notebooks
RUN mkdir -p "${JUPYTER_DIR}" \
&& mkdir -p "${HOME}/.local/share/jupyter/runtime" \
&& mkdir -p "${HOME}/.jupyter" \
&& chown -R spark:spark /home/spark /opt/spark

# Switch to non-root user
USER 185
WORKDIR ${JUPYTER_DIR}

WORKDIR /opt/spark/work-dir
RUN mkdir src
COPY src/ ./src/
EXPOSE 8888

ENTRYPOINT ["/opt/entrypoint.sh"]
# Default: start JupyterLab (the K8s manifest can override this if only spark-submit is needed)
ENTRYPOINT ["bash","-lc","jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.root_dir=${JUPYTER_DIR} --ServerApp.token='' --ServerApp.password=''"]
26 changes: 26 additions & 0 deletions dags/k8s/spark_hello.yaml
@@ -0,0 +1,26 @@
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
name: spark-hello-world
namespace: data-platform
spec:
type: Python
mode: cluster
image: "nauedu/nau-analytics-external-data-product:feat_add_jupyter_to_dockerfile"
imagePullPolicy: IfNotPresent
mainApplicationFile: "local:///opt/spark/src/jobs/hello_spark_job.py"
sparkVersion: "3.5.7"
restartPolicy:
type: Never
driver:
cores: 1
memory: "1g"
serviceAccount: spark-sa
labels:
role: spark-driver
executor:
cores: 1
instances: 2
memory: "1g"
labels:
role: spark-executor
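
Outside Airflow, the same manifest can be submitted directly against the spark-operator CRD; the sketch below (assuming the kubernetes and PyYAML packages plus a kubeconfig with access to the data-platform namespace) shows roughly what the SparkKubernetesOperator in the DAG does on submit.

# Hypothetical manual submission of the SparkApplication above.
# Assumes `pip install kubernetes pyyaml` and a valid kubeconfig.
import yaml
from kubernetes import client, config

def submit_spark_application(manifest_path="dags/k8s/spark_hello.yaml"):
    config.load_kube_config()  # use load_incluster_config() when running in-cluster
    with open(manifest_path) as f:
        body = yaml.safe_load(f)
    return client.CustomObjectsApi().create_namespaced_custom_object(
        group="sparkoperator.k8s.io",
        version="v1beta2",
        namespace="data-platform",
        plural="sparkapplications",
        body=body,
    )

if __name__ == "__main__":
    print(submit_spark_application()["metadata"]["name"])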
35 changes: 35 additions & 0 deletions dags/spark_hello_world_k8s.py
@@ -0,0 +1,35 @@
from datetime import datetime, timedelta

from airflow import DAG
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import (
    SparkKubernetesOperator,
)
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import (
    SparkKubernetesSensor,
)

with DAG(
    dag_id="spark_hello_world_k8s",
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,  # at the client this will most likely become a cron schedule
    catchup=False,
    dagrun_timeout=timedelta(minutes=30),
    tags=["spark", "kubernetes", "prod"],
) as dag:

    submit_spark = SparkKubernetesOperator(
        task_id="submit_spark_hello",
        namespace="data-platform",
        # path inside the scheduler container (explained below)
        application_file="k8s/spark_hello.yaml",
        do_xcom_push=True,
    )

    monitor_spark = SparkKubernetesSensor(
        task_id="monitor_spark_hello",
        namespace="data-platform",
        application_name="{{ task_instance.xcom_pull('submit_spark_hello')['metadata']['name'] }}",
        attach_log=True,
    )

    submit_spark >> monitor_spark
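
For a quick local check of the wiring (assuming Airflow 2.5+, where DAG.test() is available, and a kubeconfig that can reach the spark-operator), the DAG file can be run directly:

# Hypothetical local debug entry point; append to the DAG file if useful.
# Requires Airflow 2.5+ and cluster access for the operator and sensor.
if __name__ == "__main__":
    dag.test()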
15 changes: 15 additions & 0 deletions src/jobs/hello_spark_job.py
@@ -0,0 +1,15 @@
from pyspark.sql import SparkSession

def main():
    spark = SparkSession.builder.appName("hello_spark_job").getOrCreate()

    data = [("Madalena", 1), ("Vitor", 2), ("Beatriz", 3)]
    df = spark.createDataFrame(data, ["name", "value"])

    print("### Hello from Spark on Kubernetes via Airflow ###")
    df.show(truncate=False)

    spark.stop()

if __name__ == "__main__":
    main()
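
Since the image pins pyspark 3.5.7, the job's logic can also be exercised locally before going through the operator; a throwaway check along these lines (file and function names are illustrative) catches import or schema errors without a cluster.

# Hypothetical local smoke test for hello_spark_job.py; any local pyspark 3.5.x
# installation is enough, no Kubernetes required.
from pyspark.sql import SparkSession

def test_hello_dataframe():
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("hello_spark_job_test")
        .getOrCreate()
    )
    try:
        df = spark.createDataFrame(
            [("Madalena", 1), ("Vitor", 2), ("Beatriz", 3)], ["name", "value"]
        )
        assert df.count() == 3
        assert df.columns == ["name", "value"]
    finally:
        spark.stop()

if __name__ == "__main__":
    test_hello_dataframe()
    print("smoke test passed")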