I’m trying to find how to set the temp and staging buckets at the DataprocOperator. I’ve searched for all the internet and didnt find a good awnser.
import pendulum
from datetime import timedelta
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.operators.dataproc import DataprocCreateClusterOperator,
DataprocDeleteClusterOperator, DataprocSubmitJobOperator
from airflow.decorators import task, dag
# config = Variable.get("teste", deserialize_json=True)
config = {
"cluster_name": 'teste',
"region": 'us-central1',
"project_id": 'test-dataproc'
}
default_args = {
"owner": "Airflow",
"start_date": pendulum.datetime(2024, 4, 26, tz="America/Sao_Paulo"),
"retries": 0,
'retry_delay': timedelta(minutes=3)
}
@dag(
"teste_dataproc",
default_args=default_args,
schedule_interval='0 3 * * 1-5',
params=config,
catchup=False,
tags=["TESTE"],
)
def teste_dataproc():
CLUSTER_CONFIG = {
"gce_cluster_config" : {
"zone_uri": "us-central1-a",
"internal_ip_only": False,
},
"master_config": {
"num_instances": 1,
"machine_type_uri": "n1-standard-2",
"disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 25},
},
"worker_config": {
"num_instances": 2,
"machine_type_uri": "n1-standard-2",
"disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 25},
},
"software_config": {
"properties": {
"spark:spark.executor.memory": "3g", # Memory for each executor
"spark:spark.executor.cores": "2", # Number of cores per executor
"spark:spark.executor.instances": "4" # Number of executor instances
}
}
}
create_cluster = DataprocCreateClusterOperator(
task_id="create_dataproc_cluster",
project_id="{{ params.project_id }}",
cluster_config=CLUSTER_CONFIG,
region="{{ params.region }}",
cluster_name="{{ params.cluster_name }}",
staging_bucket="gs://teste/dataproc/staging", # Staging bucket
temp_bucket="gs://teste/dataproc/temp" # Temp bucket
)
create_cluster
teste_dataproc()
I know that the staging_bucket and the temp_bucket params are not correct, so, if someone knows something, would be a good help.
1