I’m trying to create a few models with python to predict city collaboration with companies, but I keep having problems in the data parsing part, I tried making a random forest model but I had issues with reading the columns, even though I cleaned the data and changed the column names to be the exact same in each file, how I can move past this?
These are the datasets I’m using:
text
<code>import pandas as pd
import matplotlib.pyplot as plt
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
cities_responses_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Cities/Cities Responses/2020_Full_Cities_Dataset.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
corp_water_security_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Water Security/2020_Corporates_Disclosing_to_CDP_Water_Security.csv")
# Merge datasets
merged_2020 = pd.merge(cities_disclosing_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_2020 = pd.merge(merged_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
merged_responses_2020 = pd.merge(cities_responses_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_responses_2020 = pd.merge(merged_responses_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
# Calculate collaboration rates
merged_2020['climate_change_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_climate']).astype(int)
merged_2020['water_security_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_water']).astype(int)
# Calculate impact
merged_responses_2020['impact'] = merged_responses_2020['Response Answer_city'].apply(lambda x: len(str(x)))
# Calculate collaboration rates and average impact
climate_change_collab_rate = merged_2020['climate_change_collaboration'].mean()
water_security_collab_rate = merged_2020['water_security_collaboration'].mean()
average_impact_2020 = merged_responses_2020['impact'].mean()
print(f"Climate Change Collaboration Rate in 2020: {climate_change_collab_rate}")
print(f"Water Security Collaboration Rate in 2020: {water_security_collab_rate}")
print(f"Average Impact on Cities in 2020: {average_impact_2020}")
# Bar plot for collaboration rates
collab_rates = {
'Climate Change': climate_change_collab_rate,
'Water Security': water_security_collab_rate
}
plt.bar(collab_rates.keys(), collab_rates.values())
plt.title('Collaboration Rates in 2020')
plt.ylabel('Rate')
for i, rate in enumerate(collab_rates.values()):
plt.text(i, rate + 0.01, f'{rate:.2f}', ha='center', va='bottom')
plt.show()
# Histogram for impact distribution
plt.hist(merged_responses_2020['impact'], bins=20, edgecolor='black')
plt.title('Impact Distribution in 2020')
plt.xlabel('Impact')
plt.ylabel('Frequency')
plt.show()
</code>
<code>import pandas as pd
import matplotlib.pyplot as plt
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
cities_responses_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Cities/Cities Responses/2020_Full_Cities_Dataset.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
corp_water_security_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Water Security/2020_Corporates_Disclosing_to_CDP_Water_Security.csv")
# Merge datasets
merged_2020 = pd.merge(cities_disclosing_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_2020 = pd.merge(merged_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
merged_responses_2020 = pd.merge(cities_responses_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_responses_2020 = pd.merge(merged_responses_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
# Calculate collaboration rates
merged_2020['climate_change_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_climate']).astype(int)
merged_2020['water_security_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_water']).astype(int)
# Calculate impact
merged_responses_2020['impact'] = merged_responses_2020['Response Answer_city'].apply(lambda x: len(str(x)))
# Calculate collaboration rates and average impact
climate_change_collab_rate = merged_2020['climate_change_collaboration'].mean()
water_security_collab_rate = merged_2020['water_security_collaboration'].mean()
average_impact_2020 = merged_responses_2020['impact'].mean()
print(f"Climate Change Collaboration Rate in 2020: {climate_change_collab_rate}")
print(f"Water Security Collaboration Rate in 2020: {water_security_collab_rate}")
print(f"Average Impact on Cities in 2020: {average_impact_2020}")
# Bar plot for collaboration rates
collab_rates = {
'Climate Change': climate_change_collab_rate,
'Water Security': water_security_collab_rate
}
plt.bar(collab_rates.keys(), collab_rates.values())
plt.title('Collaboration Rates in 2020')
plt.ylabel('Rate')
for i, rate in enumerate(collab_rates.values()):
plt.text(i, rate + 0.01, f'{rate:.2f}', ha='center', va='bottom')
plt.show()
# Histogram for impact distribution
plt.hist(merged_responses_2020['impact'], bins=20, edgecolor='black')
plt.title('Impact Distribution in 2020')
plt.xlabel('Impact')
plt.ylabel('Frequency')
plt.show()
</code>
import pandas as pd
import matplotlib.pyplot as plt
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
# Load datasets
cities_disclosing_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Predicting-city-collaboration-with-business/Datasets/Data/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv")
cities_responses_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Cities/Cities Responses/2020_Full_Cities_Dataset.csv")
corp_climate_change_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv")
corp_water_security_2020 = pd.read_csv("C:/Users/User/OneDrive/Documents/Data/Corporations/Corporations Disclosing/Water Security/2020_Corporates_Disclosing_to_CDP_Water_Security.csv")
# Merge datasets
merged_2020 = pd.merge(cities_disclosing_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_2020 = pd.merge(merged_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
merged_responses_2020 = pd.merge(cities_responses_2020, corp_climate_change_2020, on=['Account_Number', 'Year'], suffixes=('_city', '_corp_climate'))
merged_responses_2020 = pd.merge(merged_responses_2020, corp_water_security_2020, on=['Account_Number', 'Year'], suffixes=('', '_corp_water'))
# Calculate collaboration rates
merged_2020['climate_change_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_climate']).astype(int)
merged_2020['water_security_collaboration'] = (merged_2020['theme_city'] == merged_2020['theme_corp_water']).astype(int)
# Calculate impact
merged_responses_2020['impact'] = merged_responses_2020['Response Answer_city'].apply(lambda x: len(str(x)))
# Calculate collaboration rates and average impact
climate_change_collab_rate = merged_2020['climate_change_collaboration'].mean()
water_security_collab_rate = merged_2020['water_security_collaboration'].mean()
average_impact_2020 = merged_responses_2020['impact'].mean()
print(f"Climate Change Collaboration Rate in 2020: {climate_change_collab_rate}")
print(f"Water Security Collaboration Rate in 2020: {water_security_collab_rate}")
print(f"Average Impact on Cities in 2020: {average_impact_2020}")
# Bar plot for collaboration rates
collab_rates = {
'Climate Change': climate_change_collab_rate,
'Water Security': water_security_collab_rate
}
plt.bar(collab_rates.keys(), collab_rates.values())
plt.title('Collaboration Rates in 2020')
plt.ylabel('Rate')
for i, rate in enumerate(collab_rates.values()):
plt.text(i, rate + 0.01, f'{rate:.2f}', ha='center', va='bottom')
plt.show()
# Histogram for impact distribution
plt.hist(merged_responses_2020['impact'], bins=20, edgecolor='black')
plt.title('Impact Distribution in 2020')
plt.xlabel('Impact')
plt.ylabel('Frequency')
plt.show()
New contributor
user26853208 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
0