In a console Google Cloud project I have several datasets with tables/views in them, so most of them have the query from which they came. I am trying to parse every single one of them to get their table dependencies. With “table dependencies” I mean the tables next to their FROM statements or JOIN statements, not returning aliases or whole subqueries/ SELECT statements.
I am using this code:
from google.cloud import bigquery
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Parenthesis, Token
from sqlparse.tokens import Keyword, DML
def extract_tables(parsed):
tables = set()
from_seen = False
for token in parsed.tokens:
if from_seen:
if isinstance(token, IdentifierList):
for identifier in token.get_identifiers():
tables.add(identifier.get_real_name())
elif isinstance(token, Identifier):
tables.add(token.get_real_name())
elif isinstance(token, Parenthesis):
subquery = extract_tables(token)
tables.update(subquery)
elif token.ttype is Keyword and token.value.upper() in ('WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'LIMIT', 'UNION', 'EXCEPT', 'INTERSECT'):
from_seen = False
continue
if token.ttype is Keyword and token.value.upper() in ('FROM', 'JOIN'):
from_seen = True
return tables
def extract_table_names(sql):
tables = set()
parsed = sqlparse.parse(sql)
for stmt in parsed:
if stmt.get_type() == 'SELECT':
tables.update(extract_tables(stmt))
return tables
def get_table_query(client, dataset_id, table_id):
table = client.get_table(f"{dataset_id}.{table_id}")
if table.table_type == 'VIEW':
return table.view_query
return None
def list_tables_and_sources(project_id):
client = bigquery.Client(project=project_id)
datasets = list(client.list_datasets())
table_sources = {}
for dataset in datasets:
dataset_id = dataset.dataset_id
tables = list(client.list_tables(dataset_id))
for table in tables:
table_id = table.table_id
query = get_table_query(client, dataset_id, table_id)
if query:
source_tables = extract_table_names(query)
table_key = f"{dataset_id}.{table_id}"
table_sources[table_key] = list(source_tables) # Convert set to list
return table_sources
project_id = 'my-project-id'
table_sources = list_tables_and_sources(project_id)
import json
print(json.dumps(table_sources, indent=4))
I want my output to be in that schema {"my-project-id.reported_data.orders" : {dataset.tableName, dataset.TableName}}
.
Most tables are being parsed fine outputting exactly what it should be but some tables/queries are skipped entirely getting a result of an empty array.
For example, I have this view
SELECT
barcode,
CAST(CONCAT(min(EXTRACT(YEAR FROM active_date)), '-', min(EXTRACT(MONTH FROM active_date)), '-01') as DATE) as first_of_month,
count(active_date) as active_days,
FROM `my-project-id.erp_raw_data.fsn_lines`
GROUP BY barcode, (EXTRACT(YEAR FROM active_date)), EXTRACT(MONTH FROM active_date)
ORDER BY barcode, first_of_month;
But my output looks like this
{"erp_formatted_data.active_days": [],...}
Another view/query that after parsing returns an empty array is that one
SELECT
date,
barcode,
calendar_month,
calendar_year,
quarter,
image_url,
sold_quantity,
number_of_sales,
number_of_invoices,
invoiced_quantity,
invoiced_revenue,
views,
impressions,
supplier_name,
main_category,
sub_category,
(number_of_sales / views) as sales_per_views,
(number_of_sales / impressions) as sales_per_impressions,
IF(number_of_sales > 0 OR views > 0 OR impressions > 0 OR active_day = 1, 1, 0) as active_day,
FROM
`my-project-id.erp_formatted_data.daily_barcodes_stats`
I have no idea what the problem could be in my code. Any ideas?