I need to import Database.xlsx into Jupyter and be able to query it with SQL commands.
I'm using this code:
!pip install pandas==1.3.3
import pandas as pd
from pyspark.sql import SparkSession
# Initialize Spark session with Arrow disabled
spark = SparkSession.builder
.appName("ExcelToSpark")
.config("spark.sql.execution.arrow.pyspark.enabled", "false")
.getOrCreate()
# Read the Excel file using Pandas
pdf = pd.read_excel('/home/jovyan/Database.xlsx', sheet_name='DB')
# Convert Pandas DataFrame to PySpark DataFrame
sdf = spark.createDataFrame(pdf)
sdf.createOrReplaceTempView('DB')
This is not working and gives me the following error message:
AttributeError Traceback (most recent call last)
/tmp/ipykernel_222/4258738796.py in ?()
10 # Read the Excel file using Pandas
11 pdf = pd.read_excel('/home/jovyan/Database.xlsx', sheet_name='DB')
12
13 # Convert Pandas DataFrame to PySpark DataFrame
---> 14 sdf = spark.createDataFrame(pdf)
15 sdf.createOrReplaceTempView('DB')
/usr/local/spark/python/pyspark/sql/session.py in ?(self, data, schema, samplingRatio, verifySchema)
887 except Exception:
888 has_pandas = False
889 if has_pandas and isinstance(data, pandas.DataFrame):
890 # Create a DataFrame from pandas DataFrame.
--> 891 return super(SparkSession, self).createDataFrame( # type: ignore[call-overload]
892 data, schema, samplingRatio, verifySchema
893 )
894 return self._create_dataframe(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in ?(self, data, schema, samplingRatio, verifySchema)
432 "has been set to false.\n %s" % str(e)
433 )
434 warn(msg)
435 raise
--> 436 converted_data = self._convert_from_pandas(data, schema, timezone)
437 return self._create_dataframe(converted_data, schema, samplingRatio, verifySchema)
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in ?(self, pdf, schema, timezone)
470 copied = True
471 pdf[field.name] = s
472 else:
473 should_localize = not is_timestamp_ntz_preferred()
--> 474 for column, series in pdf.iteritems():
475 s = series
476 if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
477 s = _check_series_convert_timestamps_tz_local(series, timezone)
/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, name)
5985 ----------
5986 datetime : bool, default False
5987 If True, convert to date where possible.
5988 numeric : bool, default False
-> 5989 If True, attempt to convert to numbers (including strings), with
5990 unconvertible values becoming NaN.
5991 timedelta : bool, default False
5992 If True, convert to timedelta where possible.
AttributeError: 'DataFrame' object has no attribute 'iteritems'
Can somebody please help?
Thank you in advance.