I have 90 .csv files containing data, and they all have the same columns. I would like to read all the csv files and store them in a single table. The .csv files are all stored in a folder called 'Data' in my workspace. However, I keep getting an error that the path cannot be found.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Initialize Spark session
spark = SparkSession.builder.appName("CombineCSVFiles").getOrCreate()
# Specify the path where the CSV files are stored in the workspace
workspace_path = "My workspace path"
# List of CSV files in the workspace directory
# Use dbutils.fs.ls for workspace paths if necessary
file_list = [file.path for file in dbutils.fs.ls(workspace_path) if file.path.endswith(".csv")]
# Initialize an empty list to hold DataFrames
df_list = []
# Read each CSV file into a DataFrame and add to the list
for file in file_list:
    df = spark.read.option("header", "true").csv(file)
    df_list.append(df)
# Combine all DataFrames into a single DataFrame
combined_df = df_list[0]
for df in df_list[1:]:
    combined_df = combined_df.union(df)
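# Note: union() matches columns by position; if the column order could
# differ between files, unionByName() would probably be the safer call here.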
# Convert the datetime column to Timestamp type if necessary
combined_df = combined_df.withColumn("datetime", col("datetime").cast("timestamp"))
# Sort the combined DataFrame by the datetime column
sorted_df = combined_df.orderBy(col("datetime"))
# Show the result (or you can save it to a table or file)
sorted_df.show()
# Optionally, save the combined DataFrame as a new table
sorted_df.write.format("delta").saveAsTable("combined_sorted_table")
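
Is there perhaps a simpler way to do this, e.g. pointing spark.read.csv at the folder (or a *.csv glob) directly so Spark combines the files itself? Something like the rough sketch below, assuming all 90 files really do share the same header (the path is just a placeholder, not my real one):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("CombineCSVFiles").getOrCreate()

# Placeholder path to the 'Data' folder, not my actual workspace path
data_path = "/path/to/Data/*.csv"

# Read every CSV in one call; Spark unions files with the same schema itself
combined_df = (
    spark.read
    .option("header", "true")
    .csv(data_path)
    .withColumn("datetime", col("datetime").cast("timestamp"))
    .orderBy(col("datetime"))
)

combined_df.write.format("delta").saveAsTable("combined_sorted_table")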