I’m having the following problem.
I’m running a Dagster instance on an EC2 machine, inside a Docker container. The container itself can access the bucket, but when the code runs inside Dagster, I get the following error:
botocore.errorfactory.NoSuchBucket: An error occurred (NoSuchBucket) when calling the PutObject operation: The specified bucket does not exist

  File "/usr/local/lib/python3.11/site-packages/dagster/_grpc/impl.py", line 375, in get_external_sensor_execution
    with SensorEvaluationContext(
  File "/usr/local/lib/python3.11/site-packages/dagster/_core/definitions/sensor_definition.py", line 223, in __exit__
    self._exit_stack.close()
  File "/usr/local/lib/python3.11/contextlib.py", line 597, in close
    self.__exit__(None, None, None)
  File "/usr/local/lib/python3.11/contextlib.py", line 589, in __exit__
    raise exc_details[1]
  File "/usr/local/lib/python3.11/contextlib.py", line 574, in __exit__
    if cb(*exc_details):
       ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/dagster/_core/definitions/instigation_logger.py", line 118, in __exit__
    self._exit_stack.close()
  File "/usr/local/lib/python3.11/contextlib.py", line 597, in close
    self.__exit__(None, None, None)
  File "/usr/local/lib/python3.11/contextlib.py", line 589, in __exit__
    raise exc_details[1]
  File "/usr/local/lib/python3.11/contextlib.py", line 574, in __exit__
    if cb(*exc_details):
       ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/usr/local/lib/python3.11/site-packages/dagster/_core/storage/cloud_storage_compute_log_manager.py", line 97, in open_log_stream
    self._on_capture_complete(log_key)
  File "/usr/local/lib/python3.11/site-packages/dagster/_core/storage/cloud_storage_compute_log_manager.py", line 101, in _on_capture_complete
    self.upload_to_cloud_storage(log_key, ComputeIOType.STDERR)
  File "/usr/local/lib/python3.11/site-packages/dagster_aws/s3/compute_log_manager.py", line 249, in upload_to_cloud_storage
    self._s3_session.upload_fileobj(data, self._s3_bucket, s3_key, ExtraArgs=extra_args)
  File "/usr/local/lib/python3.11/site-packages/boto3/s3/inject.py", line 642, in upload_fileobj
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/s3transfer/futures.py", line 103, in result
    return self._coordinator.result()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/s3transfer/futures.py", line 266, in result
    raise self._exception
  File "/usr/local/lib/python3.11/site-packages/s3transfer/tasks.py", line 139, in __call__
    return self._execute_main(kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/s3transfer/tasks.py", line 162, in _execute_main
    return_value = self._main(**kwargs)
                   ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/s3transfer/upload.py", line 764, in _main
    client.put_object(Bucket=bucket, Key=key, Body=body, **extra_args)
  File "/usr/local/lib/python3.11/site-packages/botocore/client.py", line 565, in _api_call
    return self._make_api_call(operation_name, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/botocore/client.py", line 1021, in _make_api_call
    raise error_class(parsed_response, operation_name)
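As far as I can tell from the traceback, the failing PutObject comes from dagster_aws's S3ComputeLogManager uploading the sensor's stderr logs, which is configured in the dagster.yaml that gets copied into the image. That block looks roughly like this (bucket and prefix below are placeholders, not my actual values):

compute_logs:
  module: dagster_aws.s3.compute_log_manager
  class: S3ComputeLogManager
  config:
    bucket: "my-compute-log-bucket"     # placeholder
    prefix: "dagster-compute-logs"      # placeholder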
Dagster itself is not an off-the-shelf image but a custom one; the Dockerfile looks like this:
FROM python:3.11.3-slim as python-base
# Create necessary directories
RUN mkdir -p /opt/dagster/dagster_home /opt/dagster/app /opt/dagster/app/dagster-ml-pipeline/ /opt/dagster/app/dagster-ml-pipeline/etl/
# Install necessary system packages
RUN apt-get update && apt-get install -y supervisor curl unzip
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && unzip awscliv2.zip && ./aws/install
# Install Python packages
RUN pip install dagster-webserver dagster-postgres dagster-aws kafka-python==2.0.2 boto3
# Set environment variables
ENV DAGSTER_HOME=/opt/dagster/dagster_home/
# Copy project files
COPY repo.py workspace.yaml /opt/dagster/app/
COPY dagster.yaml /opt/dagster/dagster_home/
COPY desrv_company_graph/src/services/north_data_sync_job.py /opt/dagster/app/dagster-ml-pipeline/etl/company_graph_etls/
COPY .env /opt/dagster/app/dagster-ml-pipeline/
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
WORKDIR /opt/dagster/app
# Expose port for Dagit
EXPOSE 3000
# Start supervisor to manage processes
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
My Docker Compose file is the following:
version: '3.7'

services:
  # Kafka
  broker:
    container_name: kafka-broker
    image: confluentinc/confluent-local:7.4.1
    hostname: kafka-broker
    ports:
      - "8082:8082"
      - "9092:9092"
      - "29092:29092"
      - "29093:29093"
    environment:
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka-broker:9092,PLAINTEXT_HOST://10.130.139.46:29092
      KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,PLAINTEXT_HOST://0.0.0.0:29092,CONTROLLER://0.0.0.0:29093
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT,CONTROLLER:PLAINTEXT
      KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
      KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-broker:29093
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_NODE_ID: 1
      KAFKA_PROCESS_ROLES: broker,controller
      KAFKA_LOG_DIRS: /var/lib/kafka/data
      KAFKA_MESSAGE_MAX_BYTES: 200000000        # Increased
      KAFKA_REPLICA_FETCH_MAX_BYTES: 200000000  # Increased
    volumes:
      - /var/lib/kafka/data
    networks:
      - kafka-network

  # Kafka UI
  kafka-ui:
    container_name: kafka-ui
    image: provectuslabs/kafka-ui:latest
    ports:
      - 9999:8080
    environment:
      KAFKA_CLUSTERS_0_NAME: local
      KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka-broker:9092
    networks:
      - kafka-network

  # Dagster webserver
  dagster-dagit:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      - DAGSTER_PG_PASSWORD=postgres
      - DAGSTER_PG_USERNAME=postgres
      - DAGSTER_PG_DB=postgres
      - DAGSTER_PG_HOST=dagster-postgres
      - DAGSTER_GRPC_TIMEOUT=120  # Increase timeout to 120 seconds
    ports:
      - 40001:3000
    networks:
      - kafka-network

  # Dagster daemon
  dagster-daemon:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      - DAGSTER_GRPC_TIMEOUT=120  # Increase timeout to 120 seconds
    networks:
      - kafka-network

  # Dagster Postgres
  dagster-postgres:
    image: postgres:13.3
    ports:
      - 5433:5432
    environment:
      - POSTGRES_PASSWORD=postgres
      - POSTGRES_DB=postgres
    volumes:
      - dagster-postgres:/var/lib/postgresql/data
    networks:
      - kafka-network

networks:
  kafka-network:
    name: kafka-network

volumes:
  dagster-postgres:
    driver: local
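One detail that may matter: neither Dagster service is given any AWS-specific environment variables, so boto3 inside those containers should be falling back to the EC2 instance profile. For reference, explicitly passing region and credentials through Compose would look roughly like this; this is only a sketch with placeholder values, not something the setup above currently does:

  dagster-daemon:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      - DAGSTER_GRPC_TIMEOUT=120
      - AWS_DEFAULT_REGION=eu-central-1                 # placeholder region
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}          # only if not relying on the instance profile
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}  # only if not relying on the instance profile
    networks:
      - kafka-network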
And this is the sensor code that is actually running:
import time

from dagster import RunRequest, SkipReason, sensor

# ml_consumer_job, create_kafka_consumer, and S3_FILES_TOPIC are defined elsewhere in this project


@sensor(job=ml_consumer_job)
def s3_files_sensor(context):
    context.log.info("Starting s3_files_sensor evaluation")
    last_offset = int(context.cursor) if context.cursor else 0
    consumer = create_kafka_consumer(S3_FILES_TOPIC)

    # Wait for the consumer to get partition assignments
    while not consumer.assignment():
        consumer.poll(timeout_ms=100)
        time.sleep(0.1)  # Small sleep to avoid a tight loop

    for tp in consumer.assignment():
        consumer.seek(tp, last_offset)

    max_messages = 5  # Process fewer messages to ensure the tick completes in time
    processed_messages = 0
    timeout_seconds = 50  # Stay below 60 seconds so the tick does not exceed its timeout
    start_time = time.time()

    try:
        for records in consumer.poll(timeout_ms=1000, max_records=max_messages).values():
            for record in records:
                if record.offset >= last_offset:
                    context.update_cursor(str(record.offset))
                    context.log.info(
                        f"Received message from S3_FILES_TOPIC: {record.value}"
                    )
                    yield RunRequest(
                        run_key=str(record.offset),
                        run_config={
                            "ops": {
                                "run_ml_consumer": {
                                    "inputs": {
                                        "message": record.value,
                                    }
                                }
                            }
                        },
                    )
                    processed_messages += 1
                if (time.time() - start_time) > timeout_seconds:
                    context.log.warning(
                        "Sensor execution nearing timeout, stopping processing"
                    )
                    consumer.commit()
                    return

        if processed_messages == 0:
            yield SkipReason("No new messages since the last run.")
        else:
            consumer.commit()
    except Exception as e:
        context.log.error(f"Error in s3_files_sensor: {e}")
        raise

    context.log.info("Completed s3_files_sensor evaluation")
And this is the consumer code that reads those messages, fetches the JSON from S3, and forwards it to ml-results-topic:

import json
import logging

import boto3
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
from kafka import KafkaConsumer, KafkaProducer

logging.basicConfig(level=logging.INFO)

s3_resource = boto3.resource('s3')

consumer = KafkaConsumer(
    's3-files-topic',
    bootstrap_servers='localhost:29092',
    auto_offset_reset='earliest'
)

producer = KafkaProducer(
    bootstrap_servers='localhost:29092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)


def bucket_exists(bucket_name):
    try:
        s3_resource.meta.client.head_bucket(Bucket=bucket_name)
        return True
    except ClientError as e:
        logging.error(f"Bucket {bucket_name} does not exist or you have no access. Error: {e}")
        return False


for message in consumer:
    try:
        result = json.loads(message.value.decode('utf-8'))
        s3_bucket = result['s3_bucket']
        s3_key = result['s3_key']
        logging.info(f"Processing file from bucket: {s3_bucket}, key: {s3_key}")

        # Check if the bucket exists
        if not bucket_exists(s3_bucket):
            logging.error(f"Bucket {s3_bucket} does not exist. Skipping message.")
            continue

        # Fetch the JSON file from S3
        s3_object = s3_resource.Object(bucket_name=s3_bucket, key=s3_key)
        json_content = json.loads(s3_object.get()['Body'].read().decode('utf-8'))

        # Prepare the message for the next topic
        forward_message = {
            's3_bucket': s3_bucket,
            's3_key': s3_key,
            'json_content': json_content
        }

        # Send the message to ml-results-topic
        producer.send('ml-results-topic', forward_message)
        logging.info(f"Forwarded message to ml-results-topic: {forward_message}")
    except NoCredentialsError:
        logging.error("AWS credentials not found.")
    except PartialCredentialsError as e:
        logging.error(f"Incomplete AWS credentials provided. Error: {e}")
    except Exception as e:
        logging.error(f"Error processing message from S3: {e}")
The logs from the Kafka sensor show the following:
INFO  Starting s3_files_sensor evaluation  3:41:52.151 PM
INFO  Received message from S3_FILES_TOPIC: {"s3_bucket": "company-graph-test-bucket", "s3_key": "testing_file_12345678.json"}  3:41:52.501 PM
INFO  Completed s3_files_sensor evaluation  3:41:52.530 PM
The sensor picks up the correct message, but for some reason it cannot reach the bucket.
The bucket policy allows Get, Put, List, and all other actions.
When I list the objects in the bucket directly from within the Docker container, it works without issue.
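The kind of check I run inside the container looks like this (a minimal sketch; the bucket name is the one from the sensor log above):

import boto3

BUCKET = "company-graph-test-bucket"

# Which identity is boto3 actually using inside the container?
sts = boto3.client("sts")
print(sts.get_caller_identity()["Arn"])

s3 = boto3.client("s3")
# Raises a ClientError (404) if this identity really cannot see the bucket
s3.head_bucket(Bucket=BUCKET)
# Confirm read access by listing a few objects
print(s3.list_objects_v2(Bucket=BUCKET, MaxKeys=5).get("KeyCount"))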
I've tried a ton of options for connecting to S3 with different clients, but with no success.
Any ideas here?