This is the piece of code that gives the correct output, but its performance is very poor.
Is there a way to reference the ‘modelled_nbd_item_sub_cat_map’ table only once and still get the same result?
<code>postx_fact = spark.sql(f"""select panelist_id,tm_dim_key_week,synd_venue_dim_key, fact_bkey, derived_item_net_amount, derived_item_quantity,
case
when upper(it_co.sub_cat_code) like '%GBY%' then co_synd_item_dim_key
when synd.item_dim_key > 0 and synd.map_cw_subcategory is null then synd.item_dim_key
when de.item_dim_key > 0 and de.map_cw_subcategory is null then de.item_dim_key
when bc_ws.item_dim_key > 0 and bc_ws.map_cw_subcategory is null then bc_ws.item_dim_key
when pos_wkly.item_dim_key > 0 and pos_wkly.map_cw_subcategory is null then pos_wkly.item_dim_key
when pos_dly.item_dim_key > 0 and pos_dly.map_cw_subcategory is null then pos_dly.item_dim_key
when it_co.item_dim_key is not null and not (vn.rp_outlet='CONVENIENCE' and upper(nvl(fct.custom_col2,'NO'))='CAN/BOTTLE')
then it_co.item_dim_key
when se.item_dim_key > 0 and se.map_cw_subcategory is null then se.item_dim_key
else 922481888
end as item_dim_key_join_column
from {MAIN_SCHEMA2}.postx_fact_merge_with_ecomm_with_co fct
left outer join {TARGET_SCHEMA}.it_co_eal_v3 it_co on it_co.item_dim_key = fct.co_synd_item_dim_key
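-- each of the six joins below hits modelled_nbd_item_sub_cat_map on a different fct key column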
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map synd on synd.item_dim_key = fct.synd_item_dim_key
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map de on de.item_dim_key = fct.de_synd_item_dim_key
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map bc_ws on bc_ws.item_dim_key = fct.bc_ws_synd_item_dim_key
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map pos_wkly on pos_wkly.item_dim_key = fct.pos_wkly_synd_item_dim_key
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map pos_dly on pos_dly.item_dim_key = fct.pos_dly_synd_item_dim_key
left outer join {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map se on se.item_dim_key = fct.se_synd_item_dim_key
left outer join {TARGET_SCHEMA}.cmpltwlt_vn_eal_mstr vn on vn.venue_dim_key = fct.synd_venue_dim_key
where tm_dim_key_week between {START_WEEK} AND {END_WEEK}""")
</code>
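One partial step I can see (not a single-scan rewrite, just a sketch; `null_sub_cat_map` is only a working name, and it assumes the filtered map is small enough to cache or broadcast) is to pre-filter the map to the rows the CASE actually uses and register it once, so the six joins at least reuse the same small cached copy:
<code>
# Sketch only: pre-filter the mapping table to the rows the CASE cares about
# and cache it once. Assumes the filtered map is small enough to cache/broadcast.
null_sub_cat_map = (
    spark.table(f"{TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map")
         .where("map_cw_subcategory is null and item_dim_key > 0")
         .select("item_dim_key")
)
null_sub_cat_map.cache()
null_sub_cat_map.createOrReplaceTempView("null_sub_cat_map")
# the joins in the query above could then reference null_sub_cat_map,
# optionally with a /*+ BROADCAST(...) */ hint on each alias
</code>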
I have tried using explode and collect_list, but I am not sure how to build a CASE expression equivalent to the original one, nor how to avoid double counting the fact values:
<code>select panelist_id, tm_dim_key_week, synd_venue_dim_key, fact_bkey, derived_item_net_amount, derived_item_quantity, co_synd_item_dim_key,
    collect_list(exp_temp_item_dim_keys) as collect_exp_temp_item_dim_keys,
    collect_list(item_dim_key) as coalesce_item_dim_key,
    collect_list(item_dim_key)[5] as se_synd_item_dim_key_join_col
from
  (select *
   from (select panelist_id, tm_dim_key_week, synd_venue_dim_key, fact_bkey, derived_item_net_amount, derived_item_quantity, co_synd_item_dim_key,
            array(synd_item_dim_key, de_synd_item_dim_key, bc_ws_synd_item_dim_key, pos_wkly_synd_item_dim_key, pos_dly_synd_item_dim_key, se_synd_item_dim_key) as temp_item_dim_keys,
            explode(array(synd_item_dim_key, de_synd_item_dim_key, bc_ws_synd_item_dim_key, pos_wkly_synd_item_dim_key, pos_dly_synd_item_dim_key, se_synd_item_dim_key)) as exp_temp_item_dim_keys
         from {MAIN_SCHEMA2}.postx_fact_merge_with_ecomm_with_co
         where tm_dim_key_week between {START_WEEK} AND {END_WEEK}) fct
   left outer join (select item_dim_key from {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map where map_cw_subcategory is null) sub_cat_map
     on fct.exp_temp_item_dim_keys = sub_cat_map.item_dim_key) a
group by panelist_id, tm_dim_key_week, synd_venue_dim_key, fact_bkey, derived_item_net_amount, derived_item_quantity, co_synd_item_dim_key
</code>
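Roughly, this is where I am trying to get to: one scan of modelled_nbd_item_sub_cat_map, with the matched keys collapsed back to one row per fact row before the CASE is rebuilt in the original priority order. I am not certain it is equivalent; it assumes fact_bkey uniquely identifies a row in the fact table and that item_dim_key is unique in the map table.
<code>with null_sub_cat_map as (
    select item_dim_key
    from {TARGET_SCHEMA}.modelled_nbd_item_sub_cat_map
    where map_cw_subcategory is null
      and item_dim_key > 0
),
fct as (
    select *
    from {MAIN_SCHEMA2}.postx_fact_merge_with_ecomm_with_co
    where tm_dim_key_week between {START_WEEK} and {END_WEEK}
),
-- single scan of the map: explode the six candidate keys, keep only the ones
-- present in the filtered map, and collapse back to one row per fact row
-- (assumes fact_bkey is unique per fact row)
matched as (
    select e.fact_bkey,
           collect_set(e.cand_key) as matched_keys
    from (select fact_bkey,
                 explode(array(synd_item_dim_key, de_synd_item_dim_key, bc_ws_synd_item_dim_key,
                               pos_wkly_synd_item_dim_key, pos_dly_synd_item_dim_key,
                               se_synd_item_dim_key)) as cand_key
          from fct) e
    join null_sub_cat_map m on m.item_dim_key = e.cand_key
    group by e.fact_bkey
)
select fct.panelist_id, fct.tm_dim_key_week, fct.synd_venue_dim_key, fct.fact_bkey,
       fct.derived_item_net_amount, fct.derived_item_quantity,
       case
           when upper(it_co.sub_cat_code) like '%GBY%' then fct.co_synd_item_dim_key
           when array_contains(mt.matched_keys, fct.synd_item_dim_key) then fct.synd_item_dim_key
           when array_contains(mt.matched_keys, fct.de_synd_item_dim_key) then fct.de_synd_item_dim_key
           when array_contains(mt.matched_keys, fct.bc_ws_synd_item_dim_key) then fct.bc_ws_synd_item_dim_key
           when array_contains(mt.matched_keys, fct.pos_wkly_synd_item_dim_key) then fct.pos_wkly_synd_item_dim_key
           when array_contains(mt.matched_keys, fct.pos_dly_synd_item_dim_key) then fct.pos_dly_synd_item_dim_key
           when it_co.item_dim_key is not null
                and not (vn.rp_outlet = 'CONVENIENCE' and upper(nvl(fct.custom_col2, 'NO')) = 'CAN/BOTTLE')
               then it_co.item_dim_key
           when array_contains(mt.matched_keys, fct.se_synd_item_dim_key) then fct.se_synd_item_dim_key
           else 922481888
       end as item_dim_key_join_column
from fct
left outer join matched mt on mt.fact_bkey = fct.fact_bkey
left outer join {TARGET_SCHEMA}.it_co_eal_v3 it_co on it_co.item_dim_key = fct.co_synd_item_dim_key
left outer join {TARGET_SCHEMA}.cmpltwlt_vn_eal_mstr vn on vn.venue_dim_key = fct.synd_venue_dim_key
</code>
Because matched is aggregated to one row per fact_bkey before it is joined back, each fact row should stay a single row and the fact measures should not fan out. Is this the right direction, or is there a cleaner way to keep the original CASE priority with a single reference to the map table?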