I’m trying to create a key feature data mart using nested structs in BigQuery. Everything is working fine except that an unwanted random field is being created in the inner ARRAY_STRUCT (e.d struct) while creating the final CTE “fin_table.”
It seems that the number of columns being created exceeds the number specified in the struct_fields variable for dynamic fields, which is denoted by %s.↳
The random field has a random integer at the end of the field name, such as “_field_9.” Because of this field, I can’t append new data.
Please let me know how to get rid of this random field.
Thanks!
current result table schema is like below (problematic field is _field_9)
{‘Event_Date’: ‘DATE’,
‘Airbridge_Device_ID’: ‘STRING’,
…
…
‘e’: {‘type’: ‘RECORD’,
‘mode’: ‘REPEATED’,
‘fields’: {‘Label’: ‘STRING’,
‘Action’: ‘STRING’,
‘Event_Value_Sum’: ‘FLOAT’,
‘Event_Count’: ‘INTEGER’,
‘u’: {‘type’: ‘RECORD’,
‘mode’: ‘REPEATED’,
‘fields’: {‘User_ID’: ‘STRING’}},
‘d’: {‘type’: ‘RECORD’,
‘mode’: ‘REPEATED’,
‘fields’: {‘Timestamp’: ‘STRING’,
‘Label’: ‘STRING’,
‘Action’: ‘STRING’,
‘Value’: ‘FLOAT’,
‘tester’: ‘STRING’,
‘products_struct_alias’: {‘type’: ‘RECORD’,
‘mode’: ‘REPEATED’,
‘fields’: {‘name’: ‘STRING’, ‘price’: ‘STRING’, ‘position’: ‘STRING’}},
‘productID’: ‘STRING’,
‘price’: ‘STRING’,
‘name’: ‘STRING’,
‘_field_9’: ‘STRING’,
‘transactionID’: ‘STRING’,
}}}}
query is like below
-- Declare UDF, JSON parsing
CREATE TEMP FUNCTION extract_key_value(json STRING, key STRING)
RETURNS STRING
LANGUAGE js AS """
var obj = JSON.parse(json);
return obj[key] || null;
""";
-- Declare UDF, check key existence
CREATE TEMP FUNCTION has_key(json STRING, key STRING)
RETURNS BOOL
LANGUAGE js AS """
var obj = JSON.parse(json);
return obj.hasOwnProperty(key);
""";
-- BEGIN ~ END
BEGIN
-- ---------------------------------------------------------------- #
-- Declare variables, must be used the same number of times
-- ---------------------------------------------------------------- #
DECLARE key_list STRING DEFAULT '';
DECLARE struct_fields STRING DEFAULT '';
-- ---------------------------------------------------------------- #
-- Generate key list for dynamic pivot query
-- ---------------------------------------------------------------- #
SET key_list = (
WITH json_table AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
CASE
WHEN Custom_Event_Properties = '{}' THEN '{"tester": "true"}'
ELSE Custom_Event_Properties
END AS Custom_Event_Properties,
Semantic_Event_Properties
FROM
`airbridge_lake.app_2024`
WHERE
Event_Date = '2024-07-13'
),
keys_extracted AS (
SELECT
REGEXP_EXTRACT_ALL(Semantic_Event_Properties, r'"([^"]+)":') AS semantic_keys,
REGEXP_EXTRACT_ALL(Custom_Event_Properties, r'"([^"]+)":') AS custom_keys
FROM
json_table
),
distinct_keys AS (
SELECT DISTINCT key
FROM keys_extracted,
UNNEST(semantic_keys) AS key
WHERE key NOT IN ('products') -- Add potential nested key
UNION DISTINCT
SELECT DISTINCT key
FROM keys_extracted,
UNNEST(custom_keys) AS key
)
SELECT
CONCAT('("', STRING_AGG(key, '", "'), '")') AS keys
FROM distinct_keys
);
-- Set the dynamic struct fields for the final SELECT
SET struct_fields = (
WITH json_table AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
CASE
WHEN Custom_Event_Properties = '{}' THEN '{"tester": "true"}'
ELSE Custom_Event_Properties
END AS Custom_Event_Properties,
Semantic_Event_Properties
FROM
`airbridge_lake.app_2024`
WHERE
Event_Date = '2024-07-13'
),
keys_extracted AS (
SELECT
REGEXP_EXTRACT_ALL(Semantic_Event_Properties, r'"([^"]+)":') AS semantic_keys,
REGEXP_EXTRACT_ALL(Custom_Event_Properties, r'"([^"]+)":') AS custom_keys
FROM
json_table
),
distinct_keys AS (
SELECT DISTINCT key
FROM keys_extracted,
UNNEST(semantic_keys) AS key
WHERE key NOT IN ('products') -- Add potential nested key
UNION DISTINCT
SELECT DISTINCT key
FROM keys_extracted,
UNNEST(custom_keys) AS key
)
SELECT
STRING_AGG(CONCAT('IFNULL(', key, ', NULL) AS ', key), ', ') AS keys
FROM distinct_keys
);
-- Check if key_list or struct_fields are NULL
IF key_list IS NULL THEN
SET key_list = '("dummy_key")';
END IF;
IF struct_fields IS NULL THEN
SET struct_fields = 'IFNULL(dummy_key, NULL) AS dummy_key';
END IF;
-- ---------------------------------------------------------------- #
-- Execute dynamic pivot query
-- ---------------------------------------------------------------- #
EXECUTE IMMEDIATE FORMAT("""
WITH json_table AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
CASE
WHEN Custom_Event_Properties = '{}' THEN '{"tester": "true"}'
ELSE Custom_Event_Properties
END AS Custom_Event_Properties,
Semantic_Event_Properties
FROM `airbridge_lake.app_2024`
WHERE Event_Date = '2024-07-13'
),
-- ---------------------------------------------------------------- #
-- Extract and handle duplicate JSON keys
-- ---------------------------------------------------------------- #
keys_extracted AS (
SELECT
*,
REGEXP_EXTRACT_ALL(Semantic_Event_Properties, r'"([^"]+)":') AS semantic_keys,
REGEXP_EXTRACT_ALL(Custom_Event_Properties, r'"([^"]+)":') AS custom_keys
FROM
json_table
),
filtered_keys AS (
SELECT
*,
ARRAY(
SELECT key FROM UNNEST(semantic_keys) AS key
WHERE key NOT IN ('products')
UNION DISTINCT
SELECT key FROM UNNEST(custom_keys) AS key
) AS filtered_keys
FROM
keys_extracted
),
-- ---------------------------------------------------------------- #
-- Create key table
-- 1. Use filtered_keys array to create key column
-- 2. Use UDF to parse JSON
-- ---------------------------------------------------------------- #
unnested_semantic_keys AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
Semantic_Event_Properties, Custom_Event_Properties,
key,
extract_key_value(Semantic_Event_Properties, key) AS value
FROM
filtered_keys,
UNNEST(filtered_keys.filtered_keys) AS key
),
unnested_custom_keys AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
Semantic_Event_Properties, Custom_Event_Properties,
key,
extract_key_value(Custom_Event_Properties, key) AS value
FROM
filtered_keys,
UNNEST(filtered_keys.filtered_keys) AS key
),
-- ---------------------------------------------------------------- #
-- Concatenate key tables
-- ---------------------------------------------------------------- #
combined_keys AS (
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
Semantic_Event_Properties, Custom_Event_Properties,
key,
extract_key_value(Semantic_Event_Properties, key) AS value
FROM
unnested_semantic_keys
UNION ALL
SELECT
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value,
Semantic_Event_Properties, Custom_Event_Properties,
key,
extract_key_value(Custom_Event_Properties, key) AS value
FROM
unnested_custom_keys
),
-- ---------------------------------------------------------------- #
-- Create Products Nested JSON table
-- 1. Handle 2-level JSON, null values separately (branch)
-- 2. Merge after creation
-- ---------------------------------------------------------------- #
product_keys_extracted AS (
SELECT
Event_Date, Event_Datetime, Airbridge_Device_ID, Event_Category, Event_Label, Event_Action, Event_Value,
IFNULL(JSON_EXTRACT_ARRAY(Semantic_Event_Properties, '$.products'), []) AS products
FROM
json_table
),
products_unnested AS (
SELECT
Event_Date, Event_Datetime, Airbridge_Device_ID, Event_Category, Event_Label, Event_Action, Event_Value,
JSON_EXTRACT_SCALAR(product, '$.name') AS name,
JSON_EXTRACT_SCALAR(product, '$.price') AS price,
JSON_EXTRACT_SCALAR(product, '$.position') AS position
FROM
product_keys_extracted,
UNNEST(products) AS product
),
products_struct AS (
SELECT
Event_Date, Event_Datetime, Airbridge_Device_ID, Event_Category, Event_Label, Event_Action, Event_Value,
ARRAY_AGG(STRUCT(name as name, price as price, position as position)) AS products_struct
FROM products_unnested
GROUP BY Event_Date, Event_Datetime, Airbridge_Device_ID, Event_Category, Event_Label, Event_Action, Event_Value
),
-- ---------------------------------------------------------------- #
-- Create pivot table
-- 1. Select table with key, value columns parsed from Semantic, Custom JSON
-- 2. Pivot key-value using declared variables.
-- ---------------------------------------------------------------- #
pivot_table AS (
SELECT *
FROM combined_keys
PIVOT (
ANY_VALUE(value) FOR key IN %s
)
),
-- ---------------------------------------------------------------- #
-- Create merged table
-- 1. Select pivot table parsed from Semantic, Custom JSON
-- 2. Select table parsed and structured from Product Nested JSON in Semantic
-- 3. Merge based on key values
-- ---------------------------------------------------------------- #
merged_table AS (
SELECT
table_1.Event_Date, table_1.Event_Datetime,
table_1.Airbridge_Device_ID, table_1.Airbridge_Device_ID_Type, table_1.User_ID,
table_1.Campaign_ID, table_1.Ad_Group_ID, table_1.Ad_Creative_ID, table_1.Term_ID,
table_1.Device_Model, table_1.Device_Type, table_1.Platform, table_1.Client_IP_Country_Code, table_1.Client_IP_Subdivision, table_1.Client_IP_City,
table_1.Is_Re_engagement, table_1.Is_First_Event_per_User_ID, table_1.Is_First_Event_per_Device_ID,
table_1.Is_First_Target_Event_per_Device, table_1.Target_Event_Timestamp, table_1.Target_Event_Category,
table_1.Event_Category, table_1.Event_Label, table_1.Event_Action, table_1.Event_Value,
table_1.Semantic_Event_Properties, table_1.Custom_Event_Properties,
table_2.products_struct AS products_str,
table_1.* EXCEPT (
Event_Date, Event_Datetime,
Airbridge_Device_ID, Airbridge_Device_ID_Type, User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Category, Event_Label, Event_Action, Event_Value, Semantic_Event_Properties, Custom_Event_Properties
),
table_2.* EXCEPT (
Event_Date, Event_Datetime, Airbridge_Device_ID, Event_Category, Event_Label, Event_Action, Event_Value
)
FROM
pivot_table as table_1
LEFT JOIN
products_struct as table_2
ON table_1.Event_Date = table_2.Event_Date AND
table_1.Event_Datetime = table_2.Event_Datetime AND
table_1.Airbridge_Device_ID = table_2.Airbridge_Device_ID AND
table_1.Event_Category = table_2.Event_Category AND
table_1.Event_Label = table_2.Event_Label AND
table_1.Event_Action = table_2.Event_Action AND
table_1.Event_Value = table_2.Event_Value
),
-- ---------------------------------------------------------------- #
-- Order merged table (no significant functionality)
-- ---------------------------------------------------------------- #
order_table AS (
SELECT
Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
User_ID,
Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Datetime, Event_Category, Event_Label, Event_Action, Event_Value,
merged_table.* EXCEPT (
Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type,
Device_Model, Device_Type, Platform, Client_IP_Country_Code, Client_IP_Subdivision, Client_IP_City,
User_ID, Campaign_ID, Ad_Group_ID, Ad_Creative_ID, Term_ID,
Is_Re_engagement, Is_First_Event_per_User_ID, Is_First_Event_per_Device_ID,
Is_First_Target_Event_per_Device, Target_Event_Timestamp, Target_Event_Category,
Event_Datetime, Event_Category, Event_Label, Event_Action, Event_Value
)
FROM merged_table
),
-- ---------------------------------------------------------------- #
-- Final table
-- 1. Create 2-level struct for semantic and custom columns
-- 2. Group by Event Action, Event Label by Date and Timestamp
-- 3. Create Value and Count features for each event after 2-level group by
-- 4. Create 1:N struct for User_ID
-- 5. Create structs for device info, campaign info, etc.
-- ---------------------------------------------------------------- #
fin_table AS(
SELECT
Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type,
-- Create struct for other information --
STRUCT(
min(Device_Model) as Device_Model ,min(Device_Type) as Device_Type ,min(Platform) as Platform ,min(Client_IP_Country_Code) as Client_IP_Country_Code ,min(Client_IP_Subdivision) as Client_IP_Subdivision ,min(Client_IP_City) as Client_IP_City
,min(Campaign_ID) as Campaign_ID ,min(Ad_Group_ID) as Ad_Group_ID ,min(Ad_Creative_ID) as Ad_Creative_ID ,min(Term_ID) as Term_ID
,min(Is_Re_engagement) as Is_Re_engagement, min(Is_First_Event_per_User_ID) as Is_First_Event_per_User_ID, min(Is_First_Event_per_Device_ID) as Is_First_Event_per_Device_ID
,min(Is_First_Target_Event_per_Device) as Is_First_Target_Event_per_Device
,min(Target_Event_Timestamp) as Target_Event_Timestamp, min(Target_Event_Category) as Target_Event_Category
) AS i,
Event_Category,SUM(Event_Value_Sum) as Event_Value_Total, SUM(Event_Count) as Event_Count_Total,
-- Create struct for event details. This is the outer struct of the 2-level struct --
ARRAY_AGG(
STRUCT(
Event_Label as Label, Event_Action as Action, Event_Value_Sum as Event_Value_Sum, Event_Count, u as u, d as d
)
) as e
FROM(
-- Start subquery --
SELECT
Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type,
-- Create struct for User ID --
ARRAY_AGG(
STRUCT(
User_ID as User_ID
)
) as u,
Event_Category, Event_Label, Event_Action, SUM(Event_Value) AS Event_Value_Sum, COUNT(*) AS Event_Count,
min(Device_Model) as Device_Model ,min(Device_Type) as Device_Type ,min(Platform) as Platform ,min(Client_IP_Country_Code) as Client_IP_Country_Code ,min(Client_IP_Subdivision) as Client_IP_Subdivision ,min(Client_IP_City) as Client_IP_City
,min(Campaign_ID) as Campaign_ID ,min(Ad_Group_ID) as Ad_Group_ID ,min(Ad_Creative_ID) as Ad_Creative_ID ,min(Term_ID) as Term_ID
,min(Is_Re_engagement) as Is_Re_engagement, min(Is_First_Event_per_User_ID) as Is_First_Event_per_User_ID, min(Is_First_Event_per_Device_ID) as Is_First_Event_per_Device_ID
,min(Is_First_Target_Event_per_Device) as Is_First_Target_Event_per_Device
,min(Target_Event_Timestamp) as Target_Event_Timestamp, min(Target_Event_Category) as Target_Event_Category,
-- Create struct for event details. This is the inner struct of the 2-level struct --
ARRAY_AGG(STRUCT(
Event_Datetime as Timestamp ,Event_Label as Label, Event_Action as Action, Event_Value as Value, %s, products_str
)) AS d
FROM order_table
GROUP BY Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type, Event_Category, Event_Label, Event_Action
) AS inn
GROUP BY Event_Date, Airbridge_Device_ID, Airbridge_Device_ID_Type, Event_Category
)
select * from fin_table
-- Use declared variables --
""", key_list, struct_fields);
END;
can’t figure out the problems
Young is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.