My Spark DataFrame has 12 partitions (the NYC yellow taxi trip records for 2022). I'm trying to repartition it to even out the partition sizes:
taxi_df = taxi_df.repartition(10)
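For context, this is roughly how I create the DataFrame and inspect the partition sizes (a minimal sketch: the session setup and the Parquet file name are placeholders for my actual setup, and spark_partition_id comes from pyspark.sql.functions):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Placeholder file name for my local copy of the 2022 yellow-taxi data
taxi_df = spark.read.parquet("yellow_tripdata_2022.parquet")

print(taxi_df.rdd.getNumPartitions())  # 12 in my case

# Row count per partition, to see how uneven the split is
taxi_df.groupBy(F.spark_partition_id().alias("partition")) \
    .count().orderBy("partition").show()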
But as soon as I repartition it, taxi_df.show() fails with the following error:
Py4JJavaError Traceback (most recent call last)
File ~\Spark\spark-3.5.1-bin-hadoop3\python\pyspark\errors\exceptions\captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw)
178 try:
--> 179 return f(*a, **kw)
180 except Py4JJavaError as e:
File ~\Spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.n".
328 format(target_id, ".", name), value)
329 else:
<class 'str'>: (<class 'ConnectionRefusedError'>, ConnectionRefusedError(10061, 'No connection could be made because the target machine actively refused it', None, 10061, None))
During handling of the above exception, another exception occurred:
Py4JError Traceback (most recent call last)
Cell In[10], line 1
----> 1 taxi_df.show()
File ~\Spark\spark-3.5.1-bin-hadoop3\python\pyspark\sql\dataframe.py:945, in DataFrame.show(self, n, truncate, vertical)
885 def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = False) -> None:
886 """Prints the first ``n`` rows to the console.
887
888 .. versionadded:: 1.3.0
(...)
943 name | Bob
944 """
--> 945 print(self._show_string(n, truncate, vertical))
File ~\Spark\spark-3.5.1-bin-hadoop3\python\pyspark\sql\dataframe.py:963, in DataFrame._show_string(self, n, truncate, vertical)
957 raise PySparkTypeError(
958 error_class="NOT_BOOL",
959 message_parameters={"arg_name": "vertical", "arg_type": type(vertical).__name__},
960 )
962 if isinstance(truncate, bool) and truncate:
--> 963 return self._jdf.showString(n, 20, vertical)
964 else:
965 try:
File ~\Spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317     self.command_header +\
1318     args_command +\
1319     proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~\Spark\spark-3.5.1-bin-hadoop3\python\pyspark\errors\exceptions\captured.py:181, in capture_sql_exception.<locals>.deco(*a, **kw)
179 return f(*a, **kw)
180 except Py4JJavaError as e:
--> 181 converted = convert_exception(e.java_exception)
182 if not isinstance(converted, UnknownException):
183 # Hide where the exception came from that shows a non-Pythonic
184 # JVM exception message.
185 raise converted from None
File ~\Spark\spark-3.5.1-bin-hadoop3\python\pyspark\errors\exceptions\captured.py:143, in convert_exception(e)
141 elif is_instance_of(gw, e, "java.lang.IllegalArgumentException"):
142 return IllegalArgumentException(origin=e)
--> 143 elif is_instance_of(gw, e, "java.lang.ArithmeticException"):
144 return ArithmeticException(origin=e)
145 elif is_instance_of(gw, e, "java.lang.UnsupportedOperationException"):
File ~\Spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py:464, in is_instance_of(gateway, java_object, java_class)
460 else:
461 raise Py4JError(
462 "java_class must be a string, a JavaClass, or a JavaObject")
--> 464 return gateway.jvm.py4j.reflection.TypeUtil.isInstanceOf(
465 param, java_object)
File ~\Spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py:1664, in JavaPackage.__getattr__(self, name)
1661 return JavaClass(
1662 answer[proto.CLASS_FQN_START:], self._gateway_client)
1663 else:
-> 1664 raise Py4JError("{0} does not exist in the JVM".format(new_fqn))
Py4JError: py4j.reflection does not exist in the JVM
The following part of the error is just printed (not thrown as an exception).
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60537)
Traceback (most recent call last):
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocketserver.py", line 317, in _handle_request_noblock
self.process_request(request, client_address)
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocketserver.py", line 348, in process_request
self.finish_request(request, client_address)
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocketserver.py", line 361, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocketserver.py", line 755, in __init__
self.handle()
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonpysparkaccumulators.py", line 295, in handle
poll(accum_updates)
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonpysparkaccumulators.py", line 267, in poll
if self.rfile in r and func():
^^^^^^
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonpysparkaccumulators.py", line 271, in accum_updates
num_updates = read_int(self.rfile)
^^^^^^^^^^^^^^^^^^^^
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonpysparkserializers.py", line 594, in read_int
length = stream.read(4)
^^^^^^^^^^^^^^
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocket.py", line 706, in readinto
return self._sock.recv_into(b)
^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
----------------------------------------
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonpysparkerrorsexceptionscaptured.py", line 179, in deco
return f(*a, **kw)
^^^^^^^^^^^
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonlibpy4j-0.10.9.7-src.zippy4jprotocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <exception str() failed>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonlibpy4j-0.10.9.7-src.zippy4jclientserver.py", line 511, in send_command
answer = smart_decode(self.stream.readline()[:-1])
^^^^^^^^^^^^^^^^^^^^^^
File "C:UsersjatinAppDataLocalProgramsPythonPython311Libsocket.py", line 706, in readinto
return self._sock.recv_into(b)
^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonlibpy4j-0.10.9.7-src.zippy4jjava_gateway.py", line 1038, in send_command
response = connection.send_command(command)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:UsersjatinSparkspark-3.5.1-bin-hadoop3pythonlibpy4j-0.10.9.7-src.zippy4jclientserver.py", line 539, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving