I want to compare the performance of some JSON alternatives in Python. I set up a test as follows, comparing the standard json library with orjson and protobuf.
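The message_pb2 module is generated from a .proto schema along these lines (a sketch; the exact field numbers are arbitrary and don't matter for the benchmark, but the types match the values used below):
<code>// message.proto (sketch; field numbers arbitrary)
syntax = "proto3";

message ChatMessage {
  string sender = 1;
  int32 active_connections = 2;
  double memory_usage = 3;
  repeated string data = 4;
}
</code>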
<code>import orjson
import json
import time
import sys

import message_pb2
from google.protobuf.internal import api_implementation
from google.protobuf.json_format import MessageToDict

print(f"This should be 'upb': {api_implementation.Type()=}\n")

# Protobuf message holding the test payload
message = message_pb2.ChatMessage()
parsed_message = message_pb2.ChatMessage()
message.sender = "Test Name"
message.active_connections = 40
message.memory_usage = 0.554781258874
message.data.extend(["Hammer", "Radio"])

# Equivalent plain dict for the json/orjson tests
x = {
    "name": "Test Name",
    "age": 40,
    "random_float": 0.554781258874,
    "items": ["Hammer", "Radio"]
}

NUM_TESTS = 1000000

def perf_tests():
    # Protobuf
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        binary_data = message.SerializeToString()
    t1 = time.perf_counter()
    proto_serialize_time = (t1 - t0)*1000
    print(f"\nProtobuf serialize: {proto_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        parsed_message.ParseFromString(binary_data)
    t2 = time.perf_counter()
    proto_deserialize_time = (t2 - t1)*1000
    print(f"Protobuf deserialize: {proto_deserialize_time:,.0f} ms")
    print(f"Protobuf total: {proto_serialize_time + proto_deserialize_time:,.0f} ms")
    print(f"Protobuf file size: {sys.getsizeof(binary_data)} b")

    # Orjson
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        x_dump = orjson.dumps(x)
    t1 = time.perf_counter()
    orjson_serialize_time = (t1 - t0)*1000
    print(f"\nOrjson serialize: {orjson_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        x_1 = orjson.loads(x_dump)
    t2 = time.perf_counter()
    orjson_deserialize_time = (t2 - t1)*1000
    print(f"Orjson deserialize: {orjson_deserialize_time:,.0f} ms")
    print(f"Orjson total: {orjson_serialize_time + orjson_deserialize_time:,.0f} ms")
    print(f"Orjson file size: {sys.getsizeof(x_dump)} b")

    # Json
    t0 = time.perf_counter()
    for _ in range(NUM_TESTS):
        x_dump = json.dumps(x)
    t1 = time.perf_counter()
    json_serialize_time = (t1 - t0)*1000
    print(f"\nJson serialize: {json_serialize_time:,.0f} ms")
    for _ in range(NUM_TESTS):
        x_1 = json.loads(x_dump)
    t2 = time.perf_counter()
    json_deserialize_time = (t2 - t1)*1000
    print(f"Json deserialize: {json_deserialize_time:,.0f} ms")
    print(f"Json total: {json_serialize_time + json_deserialize_time:,.0f} ms")
    print(f"Json file size: {sys.getsizeof(x_dump)} b")

if __name__ == "__main__":
    perf_tests()
</code>
When I run the test like that, I get the following results:
<code>Protobuf serialize: 403 ms
Protobuf deserialize: 344 ms
Protobuf total: 747 ms
Protobuf file size: 64 b
Orjson serialize: 407 ms
Orjson deserialize: 645 ms
Orjson total: 1,052 ms
Orjson file size: 116 b
Json serialize: 3,588 ms
Json deserialize: 2,630 ms
Json total: 6,219 ms
Json file size: 132 b
</code>
This is all as expected: the standard json library is the slowest, orjson is much faster than json with a small reduction in file size, and protobuf is the fastest while also having a much smaller file size.
However, the deserialized data that I get back is still a protobuf message type, and any attempt I make to convert it to a dictionary absolutely destroys the performance. Using MessageToDict makes deserialization go from 344 ms to around 12,000 ms, which is worse than the standard json library.
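The timing loop for that variant looked roughly like this (same setup as above):
<code># Deserialize and convert each message to a dict; the conversion dominates
t1 = time.perf_counter()
for _ in range(NUM_TESTS):
    parsed_message.ParseFromString(binary_data)
    x_dict = MessageToDict(parsed_message)
t2 = time.perf_counter()
print(f"Protobuf deserialize + MessageToDict: {(t2 - t1)*1000:,.0f} ms")
</code>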
Building a dictionary manually from the message object attributes still pushes the deserialization time to around 1,000 ms, so once again it loses to orjson:
"name": parsed_message.sender,
"age": parsed_message.active_connections,
"random_float": parsed_message.memory_usage,
"items": parsed_message.data,
<code>x_dict = {
    "name": parsed_message.sender,
    "age": parsed_message.active_connections,
    "random_float": parsed_message.memory_usage,
    "items": parsed_message.data,  # note: still a repeated-field container, not a list
}
</code>
For my application, messages have several optional fields, so I would have to build several different manual dictionary builders, which would not be ideal. I like the reduction in file size, but I'm surprised that there doesn't seem to be a better, more performant way of doing this, unless I'm missing something?
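To illustrate the optional-field problem: each manual builder would need per-field presence checks, something like this sketch (assuming memory_usage were declared `optional` in the schema, so that HasField can be used on it):
<code># Sketch: a manual builder with a presence check for an optional field
x_dict = {
    "name": parsed_message.sender,
    "age": parsed_message.active_connections,
    "items": list(parsed_message.data),
}
if parsed_message.HasField("memory_usage"):  # only valid for fields that track presence
    x_dict["random_float"] = parsed_message.memory_usage
</code>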