I am working on a project to optimize SQL queries by determining the optimal join order using dynamic programming in Python. My goal is to minimize the total cost of joins.
Context
I am connecting to a PostgreSQL database and need to calculate the join costs for different combinations of tables. I have already written functions to calculate the scan cost of a table and the join cost between two sets of tables.
Here is an excerpt of my current code:
import psycopg2
import json
conn = psycopg2.connect(
dbname="job",
user="postgres",
password="ikram",
host="localhost",
port="5627"
)
def calculate_cost(solution, conn):
total_cost = 0
try:
with conn.cursor() as cur:
for table in solution:
cur.execute(f"EXPLAIN (ANALYZE, FORMAT JSON) SELECT * FROM {table};")
explain_result = cur.fetchone()[0]
explain_json = json.loads(explain_result) if isinstance(explain_result, str) else explain_result
total_cost += float(explain_json['Plan']['Total Cost'])
conn.commit()
except Exception as e:
print(f"Error calculating cost: {e}")
conn.rollback()
return total_cost
def remove_lower_cost_equivalents(partial_solutions, conn):
min_cost = float('inf')
best_solutions = set()
for solution in partial_solutions:
total_cost = calculate_cost(solution, conn)
if total_cost < min_cost:
min_cost = total_cost
best_solutions = {solution}
elif total_cost == min_cost:
best_solutions.add(solution)
return best_solutions
def DynamicProgramming(Rels, conn):
partialsolutions = {frozenset({rel}) for rel in Rels}
for i in range(1, len(Rels)):
new_partial_solutions = set()
for partial_tree in partialsolutions:
for rel in Rels:
if rel not in partial_tree:
new_partial_tree = partial_tree.union({rel})
new_partial_solutions.add(new_partial_tree)
partialsolutions = remove_lower_cost_equivalents(new_partial_solutions, conn)
optimal_solution = next(iter(partialsolutions))
optimal_cost = calculate_cost(optimal_solution, conn)
optimal_order = list(optimal_solution)
return len(Rels), optimal_cost, optimal_order
# Running the code
relations = ['title', 'movie_info', 'complete_cast']
nb_relations, cost, optimal_order = DynamicProgramming(relations, conn)
print(f"Number of relations: {nb_relations}")
print(f"Optimal cost: {cost}")
print(f"Optimal order: {optimal_order}")
conn.close()
I am encountering issues with the join costs being very high, and I am unsure how to improve the implementation to get more realistic costs. Additionally, I would like to know if there is a method to pre-calculate and store the join costs between all combinations of tables and then use them to optimize the join order.
user25210056 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.