I want to compare the performance of some JSON alternatives in Python. I set up a test as follows, comparing the standard json library with orjson and protobuf.
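The message_pb2 module is generated from a .proto schema along these lines (a sketch; the exact field numbers are arbitrary and don't matter for the benchmark, but the types match the values used below):
<code>// message.proto (sketch; field numbers arbitrary)
syntax = "proto3";

message ChatMessage {
  string sender = 1;
  int32 active_connections = 2;
  double memory_usage = 3;
  repeated string data = 4;
}
</code>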
<code>import orjson
import json
import time
import sys

import message_pb2
from google.protobuf.internal import api_implementation
from google.protobuf.json_format import MessageToDict

print(f"This should be 'upb': {api_implementation.Type()=}\n")

# Protobuf message holding the test payload
message = message_pb2.ChatMessage()
parsed_message = message_pb2.ChatMessage()
message.sender = "Test Name"
message.active_connections = 40
message.memory_usage = 0.554781258874
message.data.extend(["Hammer", "Radio"])

# Equivalent plain dict for the json/orjson tests
x = {
    "name": "Test Name",
    "age": 40,
    "random_float": 0.554781258874,
    "items": ["Hammer", "Radio"]
}

NUM_TESTS = 1000000

def perf_tests():
    # Protobuf
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        binary_data = message.SerializeToString()
    t1 = time.perf_counter()
    proto_serialize_time = (t1 - t0)*1000
    print(f"\nProtobuf serialize: {proto_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        parsed_message.ParseFromString(binary_data)
    t2 = time.perf_counter()
    proto_deserialize_time = (t2 - t1)*1000
    print(f"Protobuf deserialize: {proto_deserialize_time:,.0f} ms")
    print(f"Protobuf total: {proto_serialize_time + proto_deserialize_time:,.0f} ms")
    print(f"Protobuf file size: {sys.getsizeof(binary_data)} b")

    # Orjson
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        x_dump = orjson.dumps(x)
    t1 = time.perf_counter()
    orjson_serialize_time = (t1 - t0)*1000
    print(f"\nOrjson serialize: {orjson_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        x_1 = orjson.loads(x_dump)
    t2 = time.perf_counter()
    orjson_deserialize_time = (t2 - t1)*1000
    print(f"Orjson deserialize: {orjson_deserialize_time:,.0f} ms")
    print(f"Orjson total: {orjson_serialize_time + orjson_deserialize_time:,.0f} ms")
    print(f"Orjson file size: {sys.getsizeof(x_dump)} b")

    # Json
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        x_dump = json.dumps(x)
    t1 = time.perf_counter()
    json_serialize_time = (t1 - t0)*1000
    print(f"\nJson serialize: {json_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        x_1 = json.loads(x_dump)
    t2 = time.perf_counter()
    json_deserialize_time = (t2 - t1)*1000
    print(f"Json deserialize: {json_deserialize_time:,.0f} ms")
    print(f"Json total: {json_serialize_time + json_deserialize_time:,.0f} ms")
    print(f"Json file size: {sys.getsizeof(x_dump)} b")

if __name__ == "__main__":
    perf_tests()
</code>
When I run the test like that, I get the following results:
<code>Protobuf serialize: 403 ms
Protobuf deserialize: 344 ms
Protobuf total: 747 ms
Protobuf file size: 64 b
Orjson serialize: 407 ms
Orjson deserialize: 645 ms
Orjson total: 1,052 ms
Orjson file size: 116 b
Json serialize: 3,588 ms
Json deserialize: 2,630 ms
Json total: 6,219 ms
Json file size: 132 b
</code>
This is all as expected: the standard json library is the slowest, orjson is much faster than json with a small reduction in file size, and protobuf is the fastest while also having a much smaller file size.
However, the deserialized data that I get back is still a protobuf message type, and any attempt I make to convert it to a dictionary absolutely destroys the performance. Using MessageToDict makes deserialization go from 344 ms to around 12,000 ms, which is worse than the standard json library.
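The timing loop for that variant looked roughly like this (same setup as above):
<code># Deserialize and convert each message to a dict; the conversion dominates
t1 = time.perf_counter()
for _ in range(NUM_TESTS):
    parsed_message.ParseFromString(binary_data)
    x_dict = MessageToDict(parsed_message)
t2 = time.perf_counter()
print(f"Protobuf deserialize + MessageToDict: {(t2 - t1)*1000:,.0f} ms")
</code>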
Building a dictionary manually from the message object attributes still pushes the deserialization time to around 1,000 ms, so once again it loses to orjson:
"name": parsed_message.sender,
"age": parsed_message.active_connections,
"random_float": parsed_message.memory_usage,
"items": parsed_message.data,
<code>x_dict = {
    "name": parsed_message.sender,
    "age": parsed_message.active_connections,
    "random_float": parsed_message.memory_usage,
    "items": parsed_message.data,  # note: still a repeated-field container, not a list
}
</code>
For my application, messages have several optional fields, so I would have to build several different manual dictionary builders, which would not be ideal. I like the reduction in file size, but I'm surprised that there doesn't seem to be a better, more performant way of doing this, unless I'm missing something?
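To illustrate the optional-field problem: each manual builder would need per-field presence checks, something like this sketch (assuming memory_usage were declared `optional` in the schema, so that HasField can be used on it):
<code># Sketch: a manual builder with a presence check for an optional field
x_dict = {
    "name": parsed_message.sender,
    "age": parsed_message.active_connections,
    "items": list(parsed_message.data),
}
if parsed_message.HasField("memory_usage"):  # only valid for fields that track presence
    x_dict["random_float"] = parsed_message.memory_usage
</code>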