I’m trying to load a large PySpark DataFrame into a PostgreSQL table and I’m considering two different approaches. I’d like to know which one is generally faster, or whether there’s a better approach I should consider.
Method 1: Convert PySpark DataFrame to multiple CSV files and load them into PostgreSQL
# Write the DataFrame out as 400 CSV part files (no header row)
filepath = "/base_path/psql_multiprocessing_data"
df.repartition(400).write.mode("overwrite").format("csv").save(filepath, header="false")
import glob
import psycopg2
from multiprocessing import Pool, cpu_count
file_path_list = sorted(glob.glob("/base_path/psql_multiprocessing_data/*.csv"))
def psql_copy_load(fileName):
    # Each worker opens its own connection and streams one CSV part file via COPY
    con = psycopg2.connect(database="my_db", user="my_user", password="my_password",
                           host="my_host", port="my_port")
    cursor = con.cursor()
    with open(fileName, 'r') as f:
        cursor.copy_from(f, 'my_schema.my_table', sep=",")
    con.commit()
    con.close()
    return fileName

if __name__ == "__main__":  # guard needed when multiprocessing uses the spawn start method
    with Pool(cpu_count()) as p:
        p.map(psql_copy_load, file_path_list)
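One thing I’m unsure about with Method 1: psycopg2 2.9+ quotes the table name passed to copy_from, so a schema-qualified name like 'my_schema.my_table' may raise an error. A variant of the worker using copy_expert (same placeholder connection details as above) that should avoid this:

def psql_copy_load_expert(fileName):
    # copy_expert lets us spell out the COPY statement ourselves, so the
    # schema-qualified table name is used as-is
    con = psycopg2.connect(database="my_db", user="my_user", password="my_password",
                           host="my_host", port="my_port")
    # The connection context manager commits on success and rolls back on error
    with con, con.cursor() as cursor, open(fileName, 'r') as f:
        cursor.copy_expert("COPY my_schema.my_table FROM STDIN WITH (FORMAT csv)", f)
    con.close()
    return fileName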
Method 2: Convert the PySpark DataFrame into smaller pandas DataFrame chunks, write to a StringIO buffer, and load directly into PostgreSQL
import math
from io import StringIO

# engine (a SQLAlchemy engine), separator, and table_name are defined elsewhere
chunks = math.ceil(df_ops.count() / 100000)  # total row count; count() assumed to behave like Spark's DataFrame.count()
chunk_result = None
chunk_count = 0
for i in range(0, chunks):
    start_value = i * 100000
    end_value = (i + 1) * 100000
    df = df_ops.get_chunk(start_value, end_value)  # custom function to get a chunk of 100,000 rows
    # Serialize the chunk to CSV in memory; note that na_rep='nan' does not
    # match COPY's default NULL marker ('\N')
    sio = StringIO()
    sio.write(df.toPandas().to_csv(index=False, header=False, sep=separator, na_rep='nan'))
    sio.seek(0)
    raw_connections = engine.raw_connection()
    with raw_connections.cursor() as c:
        c.copy_from(sio, table_name, columns=df.columns, sep=separator)
    raw_connections.commit()
    raw_connections.close()
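For completeness, engine above is a SQLAlchemy engine that I create roughly like this (placeholder credentials):

from sqlalchemy import create_engine

# Placeholder connection string; psycopg2 is the underlying driver
engine = create_engine("postgresql+psycopg2://my_user:my_password@my_host:5432/my_db")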
Context:
The DataFrame is very large, and I’m trying to minimize the load time into PostgreSQL.
Method 1 writes the data out to disk as CSV files and loads them in parallel with multiprocessing, while Method 2 streams smaller chunks through an in-memory buffer and loads them directly.
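For reference, I also looked at Spark’s built-in JDBC writer, but I haven’t benchmarked it against the two methods above; as far as I understand it issues batched INSERTs over JDBC rather than COPY. A minimal sketch with placeholder connection details:

(df.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://my_host:5432/my_db")  # placeholder host/port
    .option("dbtable", "my_schema.my_table")
    .option("user", "my_user")
    .option("password", "my_password")
    .option("driver", "org.postgresql.Driver")
    .option("batchsize", 10000)  # rows per INSERT batch
    .mode("append")
    .save())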
Question:
Which of these methods is likely to be faster in practice?
Are there any significant trade-offs between these two methods that I should be aware of?
Is there a more efficient way to handle this type of data loading?
Any insights or recommendations would be greatly appreciated!