docker-compose.yml

x-airflow-common:
  &airflow-common
  build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    # AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    # yamllint disable rule:line-length
    # Use a simple HTTP server on the scheduler for health checks; this must be 'true'
    # for the curl check on port 8974 in the airflow-scheduler service below to pass.
    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
    # yamllint enable rule:line-length
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    # WARNING: use the _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks.
    # For any other purpose (development, test and especially production) build/extend the Airflow image.
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
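    # For example, to pull in the Spark provider for a throwaway experiment (shown only
    # as an illustration; bake it into requirements.txt for anything beyond a quick test):
    # _PIP_ADDITIONAL_REQUIREMENTS: apache-airflow-providers-apache-spark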
    # The following line can be used to set a custom config file, stored in the local config folder.
    # If you want to use it, uncomment it and replace airflow.cfg with the name of your config file.
    # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
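    # NOTE: avoid bind-mounting over /opt/airflow itself; that hides the Airflow home
    # baked into the image and shadows the four mounts above. The path below assumes
    # the project files live under D:/pyspark/datasets on the host: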
    - D:/pyspark/datasets:/opt/airflow/datasets
user: "${AIRFLOW_UID:-50000}:0"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always
    networks:
      - mynetwork
  jupyter:
    image: jupyter/pyspark-notebook
    container_name: jupyter-pyspark
    ports:
      - "8888:8888"
      - "4040:4040"
    volumes:
      - D:/pyspark/datasets:/home/jovyan/work
    environment:
      - JUPYTER_ENABLE_LAB=yes
    networks:
      - mynetwork
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      - ZOOKEEPER_CLIENT_PORT=2181
      - ZOOKEEPER_TICK_TIME=2000
    networks:
      - mynetwork
  broker:
    image: confluentinc/cp-kafka:7.5.0
    container_name: broker
    depends_on:
      - zookeeper
    ports:
      - "29092:29092"
      - "9092:9092"
      - "9101:9101"
    environment:
      - KAFKA_BROKER_ID=1
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
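      # Clients inside the compose network reach Kafka at broker:29092 (PLAINTEXT);
      # clients on the host machine connect via localhost:9092 (PLAINTEXT_HOST).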
      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
      - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
      - KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1
      - KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1
      - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
      - KAFKA_JMX_PORT=9101
      - KAFKA_JMX_HOSTNAME=localhost
    networks:
      - mynetwork
  mysql:
    image: mysql:8.0
    container_name: mysql
    ports:
      - "3307:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=12345678
      - MYSQL_DATABASE=dev
      - MYSQL_USER=user
      - MYSQL_PASSWORD=12345678
    volumes:
      - mysql_data:/var/lib/mysql
    networks:
      - mynetwork
  redis:
    # Redis is limited to 7.2-bookworm due to the licensing change
    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
    image: redis:7.2-bookworm
    expose:
      - 6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always
    networks:
      - mynetwork
  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8081:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - mynetwork
  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - mynetwork
  airflow-python:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
    user: "50000:0"
    entrypoint: [ "bash" ]
  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - mynetwork
  airflow-init:
    # Use the same locally built image as the other Airflow services; a pinned
    # apache/airflow:2.9.3 here would run DB migrations for a different version
    # than the 2.2.5-based image the webserver and scheduler actually run.
    <<: *airflow-common
    entrypoint: /bin/bash
    command:
      - -c
      - |
        apt update &&
        apt-get install -y openjdk-11-jdk &&
        apt-get install -y ant &&
        apt-get clean;
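        # NOTE: the image built from the Dockerfile below already ships the JDK,
        # so this runtime install only affects this one-off init container and
        # can likely be dropped.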
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set"
          echo "the AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with a manually created .env file:"
          echo "  See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources
    networks:
      - mynetwork
  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow
    networks:
      - mynetwork

volumes:
  postgres-db-volume:
  mysql_data:

networks:
  mynetwork:
    driver: bridge
Dockerfile:

FROM apache/airflow:2.2.5-python3.8

USER root

# Install OpenJDK 11 and ant
RUN apt update && \
    apt-get install -y openjdk-11-jdk && \
    apt-get install -y ant && \
    apt-get clean

# Set JAVA_HOME; ENV persists into every container built from this image,
# so the former "RUN export JAVA_HOME" (a no-op across layers) is not needed
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/

USER airflow
COPY ./requirements.txt /
RUN pip install -r /requirements.txt
COPY --chown=airflow:root ./dags /opt/airflow/dags
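
To confirm the JDK actually lands in the running containers, rebuild and check from the host (a quick sanity test; it assumes the stack is up and uses the service names defined above):

docker compose up -d --build
docker compose exec airflow-webserver bash -c 'java -version && echo "$JAVA_HOME"'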

The DAG:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator  # was missing; the final task below needs it
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

default_args = {
    'owner': 'Airflow',
    'start_date': datetime(2024, 1, 1),  # static placeholder date; a dynamic datetime.now() is discouraged
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
}

with DAG(
        dag_id='Mysql_ingestion',
        default_args=default_args,
        schedule_interval='0 * * * *',
        catchup=False) as dag:  # catchup=False prevents backfilling

    # Define the SparkSubmitOperator task
    spark_action = SparkSubmitOperator(
        task_id='spark_action',
        application='/opt/airflow/datasets/projects/credit-report-workflow/shellaction.py',  # Spark job file
        conn_id='spark_default',  # Connection ID for Spark; ensure this is configured in Airflow
    )

    final = BashOperator(
        task_id='final',
        bash_command='echo "done"',
    )

    # Set task dependencies
    spark_action >> final
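
For reference, spark_default has to point at a Spark master that is reachable from the Airflow containers, or spark-submit fails. A minimal sketch using the Airflow CLI (the local[*] master is an assumption; replace it with your real master URL, and skip this if spark_default already exists):

docker compose exec airflow-webserver airflow connections add spark_default \
    --conn-type spark \
    --conn-host 'local[*]'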

Has anyone had this problem? I have tried multiple ways but am still unable to fix it: I keep getting "JAVA_HOME not set".

(airflow) $ java -version
/bin/sh: 1: java: not found
(airflow) $ cd /opt/airflow/datasets/projects/credit-report-workflow/
(airflow) $ ls
DAGS  'Data ingestion.ipynb'  csv_read.py  'from pyspark.py'  report-ingestion-dev.py  shellaction.sh
(airflow) $ sh shellaction.sh
shellaction.sh: 1: spark-submit: not found

I mount the /datasets/projects/ folder in all containers.