How can I load tables in parallel in Apache Spark?
The tables are small, so there is no need to partition them.
When I run this code/method in a sequential loop, it works fine.
But as soon as I try to execute it in parallel via a ThreadPool, I get strange errors:
from databricks.sdk.runtime import dbutils, spark
import py4j
import multiprocessing
driver = "com.teradata.jdbc.TeraDriver"
url = dbutils.secrets.get(scope="scope1", key="url-key")
user = dbutils.secrets.get(scope="scope1", key="user-key")
password = dbutils.secrets.get(scope="scope1", key="password-key")
target_catalog = 'myCatalog'
target_schema = 'mySchema'
source_db = "ourSourceDB"
table_csv = "table1, table2, table3, table4, table5"
tableNames = [item.strip() for item in table_csv.split(",")]
def replicateTable(tableName):
    print('start: ' + tableName)
    props = {'user': user, 'password': password, 'driver': driver}
    df = spark.read.jdbc(url, source_db + '.' + tableName, properties=props)
    df.write.format("delta").mode("overwrite").saveAsTable(target_catalog + '.' + target_schema + '.' + tableName)
# This results in exceptions like:
# py4j.Py4JException: Method setProperty([class java.lang.String, class java.lang.String]) does not exist
with multiprocessing.Pool() as pool:
    pool.map(replicateTable, tableNames)
# This works fine:
for tableName in tableNames:
    replicateTable(tableName)
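I suspect the worker processes spawned by multiprocessing cannot talk to the driver's py4j gateway, which would explain why the Py4J errors only show up in the parallel version. Would a thread pool on the driver be the right approach instead? Here is a minimal sketch of what I have in mind, assuming the same replicateTable function and the notebook-provided spark session (max_workers=4 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

# Threads share the driver's SparkSession and its py4j connection,
# so each call simply submits a separate Spark job.
with ThreadPoolExecutor(max_workers=4) as executor:
    # Materialize the iterator so any exception raised in a worker thread surfaces here
    list(executor.map(replicateTable, tableNames))

Or is there a more idiomatic Spark way to load several small tables in parallel?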