I have a pandas DataFrame that I am writing to S3 using the PyArrow engine. The data is partitioned, and PyArrow throws an error that more than 1024 partitions cannot be written. Is there a way to overcome this limitation?
<code>df.to_parquet(s3_output_path,
              compression='snappy',
              engine='pyarrow',
              basename_template='part-{i}.parquet',
              partition_cols=['a_id'],
              existing_data_behavior='overwrite_or_ignore')
</code>
Stack trace
<code>Traceback (most recent call last):
  File "/opt/predict.py", line 149, in <module>
    llm.demo()
  File "/opt/predict.py", line 99, in demo
    df1.to_parquet(s3_output_path,
  File "/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py", line 207, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py", line 2835, in to_parquet
    return to_parquet(
  File "/usr/local/lib/python3.10/dist-packages/pandas/io/parquet.py", line 420, in to_parquet
    impl.write(
  File "/usr/local/lib/python3.10/dist-packages/pandas/io/parquet.py", line 186, in write
    self.api.parquet.write_to_dataset(
  File "/usr/local/lib/python3.10/dist-packages/pyarrow/parquet/__init__.py", line 3153, in write_to_dataset
    ds.write_dataset(
  File "/usr/local/lib/python3.10/dist-packages/pyarrow/dataset.py", line 930, in write_dataset
    _filesystemdataset_write(
  File "pyarrow/_dataset.pyx", line 2737, in pyarrow._dataset._filesystemdataset_write
  File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Fragment would be written into 34865 partitions. This exceeds the maximum of 1024
</code>
Update:
Added max_partitions to the parameters and received an unknown-parameter error:
<code>  File "/usr/local/lib/python3.10/dist-packages/pandas/io/parquet.py", line 186, in write
    self.api.parquet.write_to_dataset(
  File "/usr/local/lib/python3.10/dist-packages/pyarrow/parquet/__init__.py", line 3137, in write_to_dataset
    write_options = parquet_format.make_write_options(**kwargs)
  File "pyarrow/_dataset_parquet.pyx", line 181, in pyarrow._dataset_parquet.ParquetFileFormat.make_write_options
  File "pyarrow/_dataset_parquet.pyx", line 514, in pyarrow._dataset_parquet.ParquetFileWriteOptions.update
TypeError: unexpected parquet write option: max_partitions
</code>
You can try setting max_partitions to the number of partitions you expect. Keep in mind that writing a large number of partitions, with many files open at once, will take time.
<code>df.to_parquet(
    s3_output_path,
    compression="snappy",
    engine="pyarrow",
    basename_template="part-{i}.parquet",
    partition_cols=["a_id"],
    existing_data_behavior="overwrite_or_ignore",
    max_partitions=df["a_id"].nunique(),
)
</code>
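If your installed pyarrow does not forward max_partitions from DataFrame.to_parquet (as the TypeError in the update suggests), a possible fallback is to skip the pandas wrapper and call pyarrow.dataset.write_dataset directly, which exposes max_partitions as a keyword. A minimal sketch, reusing df and s3_output_path from the question; the hive partitioning flavor is chosen to mimic the a_id=&lt;value&gt; directory layout that to_parquet produces:
<code>import pyarrow as pa
import pyarrow.dataset as ds

# Convert the dataframe to an Arrow table; drop the pandas index so it is
# not written as an extra column.
table = pa.Table.from_pandas(df, preserve_index=False)

parquet_format = ds.ParquetFileFormat()

ds.write_dataset(
    table,
    s3_output_path,                          # same S3 prefix as in the question
    format=parquet_format,
    file_options=parquet_format.make_write_options(compression="snappy"),
    basename_template="part-{i}.parquet",
    partitioning=["a_id"],                   # partition column from the question
    partitioning_flavor="hive",              # a_id=<value>/ directories, like to_parquet
    existing_data_behavior="overwrite_or_ignore",
    max_partitions=df["a_id"].nunique(),     # raise the 1024 default to what the data needs
    # max_open_files (default 1024) can also be raised, but keeping many
    # files open at once uses more memory.
)
</code>
Whether to_parquet itself accepts max_partitions (as in the snippet above) depends on the installed pandas and pyarrow versions, so treat the direct write_dataset call as a workaround rather than the primary approach.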