I am getting the error “An error occurred while calling o110.pyWriteDynamicFrame. Exception thrown in awaitResult:” in my AWS Glue job.
The size of my source data in s3 is around 60 GB
I am reading data from S3, applying a filter transformation that removes garbage rows using a regex, then mapping the columns to the ones in my Redshift table and writing the result to Redshift.
Here is what my Glue script looks like:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
import re
# Bootstrap the Glue job from the required --JOB_NAME argument.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the source dataset (cataloged S3 data, ~60 GB) as a DynamicFrame.
AWSGlueDataCatalog_node = glueContext.create_dynamic_frame.from_catalog(database="database_name", table_name="table_name", transformation_ctx="AWSGlueDataCatalog_node")

# BUG FIX: the original pattern r'^d{4}-d{2}-d{2}Td{2}:d{2}:d{2}+0000$' was
# broken twice over: every \d character class was missing its backslash
# (matching a literal letter "d"), and the bare `+` right after `{2}` is an
# invalid repeat — re.error("multiple repeat") on Python < 3.11. Because the
# lambda only runs inside the Spark executors at write time, that error
# surfaces as the opaque "Exception thrown in awaitResult" failure.
# Compile once and reuse across all rows; matches timestamps like
# 2024-01-31T23:59:59+0000.
timestamp_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+0000$')

# Keep only rows whose col0 is a well-formed timestamp. The explicit None
# check guards against NULL values, which would raise TypeError in match().
Filter_node = Filter.apply(
    frame=AWSGlueDataCatalog_node,
    f=lambda x: x["col0"] is not None and bool(timestamp_pattern.match(x["col0"])),
    transformation_ctx="Filter_node",
)

# Rename the generic catalog columns to the Redshift table's column names.
ChangeSchema_node = ApplyMapping.apply(
    frame=Filter_node,
    mappings=[
        ("col0", "string", "active_at", "string"),
        ("col1", "string", "client_id", "string"),
        ("col2", "string", "profile_uuid", "string"),
    ],
    transformation_ctx="ChangeSchema_node",
)

# Write to Redshift via the Glue connection; the preaction creates the
# target table on first run. redshiftTmpDir stages the intermediate files.
AmazonRedshift_node = glueContext.write_dynamic_frame.from_options(
    frame=ChangeSchema_node,
    connection_type="redshift",
    connection_options={
        "redshiftTmpDir": "s3://aws-glue-assets/temporary",
        "useConnectionProperties": "true",
        "dbtable": "dbtable",
        "connectionName": "connectionName",
        "preactions": "CREATE TABLE IF NOT EXISTS table_name (active_at VARCHAR, client_id VARCHAR, profile_uuid VARCHAR);",
    },
    transformation_ctx="AmazonRedshift_node",
)
job.commit()